// CuNNy 4x8C BILINEAR RGB NVL DN - https://github.com/funnyplanter/CuNNy

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-DN-D08N04

// Source texture of the effect; Pass 1 lists it on its //!IN line.
//!TEXTURE
Texture2D INPUT;

// Result texture of the effect, declared at twice the width and height of
// INPUT.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Point-filtered sampler; this is the sampler the O() macro uses.
//!SAMPLER
//!FILTER POINT
SamplerState SP;

// Linear-filtered sampler. NOTE(review): not referenced in the portion of the
// file reviewed here; presumably used by a later pass - confirm.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;

//!COMMON
// O(t, p): sample texture `t` with the SP sampler at mip level 0, at the
// coordinate `pos + p * pt`. `pos` and `pt` are not macro parameters; they
// must be variables in scope wherever the macro is expanded.
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
// Short aliases for the minimum-16-bit-precision vector and matrix types.
#define V4 min16float4
#define M4 min16float4x4

// Intermediate textures t0..t3: each is declared at INPUT size with format
// R8G8B8A8_SNORM. Per the //!IN and //!OUT lines of the passes below, Pass 1
// writes t0/t1, Pass 2 reads t0/t1 and writes t2/t3, and Pass 3 reads t2/t3
// and writes t0/t1 again.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t2;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t3;

//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0, t1

// l0(x, y): scalar input value for this pass. The rgb of INPUT sampled at
// offset (x, y) (through O) is dotted with three fixed weights, a constant is
// added, and the result is converted to min16float.
#define l0(x, y) min16float((dot(float3(2.214e-01, 4.385e-01, 1.006e-01), O(INPUT, float2(x, y)).rgb) + -6.858e-01))

// Pass 1, first output vector: a weighted sum of the nine scalar inputs
// s0_0..s0_8 (each scaled by its own constant V4) plus a constant V4 offset.
// Pass1 passes the 3x3 l0() taps here and stores the result in t0.
// NOTE(review): the literals are presumably trained network weights; the
// accumulation order is kept as-is because it can affect min16float rounding.
V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
	V4 r = 0.0;
	r += V4(-2.401e-02, 1.817e-03, -1.218e-01, 2.796e-02) * s0_0;
	r += V4(3.256e-02, 3.929e-03, -5.850e-02, -5.602e-02) * s0_1;
	r += V4(4.497e-04, -1.812e-02, 5.241e-02, 3.698e-02) * s0_2;
	r += V4(5.371e-01, -2.302e-01, -1.373e-01, -4.038e-03) * s0_3;
	r += V4(1.565e-01, -6.067e-02, 3.397e-01, -3.741e-01) * s0_4;
	r += V4(-2.095e-03, 4.044e-02, -3.770e-02, 5.665e-02) * s0_5;
	r += V4(-1.993e-01, -2.645e-01, -8.892e-02, 1.948e-02) * s0_6;
	r += V4(-4.865e-01, 5.400e-01, -1.396e-01, 1.270e-01) * s0_7;
	r += V4(-1.667e-02, -9.433e-03, -1.324e-02, -1.803e-03) * s0_8;
	// Constant offset added after all nine taps.
	r += V4(2.880e-04, 1.418e-02, 1.413e-02, -1.036e-01);
	return r;
}

// Pass 1, second output vector: same form as f0 of this pass (nine scalar
// inputs, each scaled by a constant V4, plus a constant V4 offset) but with a
// different set of constants. Pass1 stores the result in t1.
V4 f1(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
	V4 r = 0.0;
	r += V4(-4.610e-02, -6.199e-01, 8.493e-03, -1.532e-02) * s0_0;
	r += V4(-7.178e-02, 5.957e-01, 1.575e-03, 1.807e-02) * s0_1;
	r += V4(1.106e-01, 3.625e-03, 3.713e-02, -4.124e-03) * s0_2;
	r += V4(1.288e-01, -5.582e-02, 5.082e-02, 1.674e-02) * s0_3;
	r += V4(-6.074e-01, 8.818e-02, -3.371e-01, -6.663e-01) * s0_4;
	r += V4(-8.030e-02, -4.780e-03, -3.421e-01, 5.358e-02) * s0_5;
	r += V4(4.990e-01, 7.623e-03, 1.778e-03, 2.401e-02) * s0_6;
	r += V4(9.546e-02, -1.656e-02, 6.935e-04, 6.387e-01) * s0_7;
	r += V4(-2.302e-02, 5.209e-03, 5.835e-02, -6.361e-02) * s0_8;
	// Constant offset added after all nine taps.
	r += V4(-4.485e-04, -2.620e-04, 2.449e-02, -7.403e-04);
	return r;
}

// Pass 1 ("in"): handles one input pixel per thread. It gathers the 3x3
// neighbourhood of l0() values around that pixel and writes the V4 results of
// f0 and f1 to t0 and t1 at the pixel's coordinate.
//   blockStart - added to the per-thread offset to form the pixel coordinate.
//   tid        - only tid.x is read; Rmp8x8 turns it into a 2D offset.
void Pass1(uint2 blockStart, uint3 tid) {
	const uint2 dims = GetInputSize();
	const uint2 px = blockStart + Rmp8x8(tid.x);
	// Nothing to do for threads whose pixel falls outside the input.
	if (!(px.x < dims.x && px.y < dims.y)) {
		return;
	}

	// The names `pt` and `pos` are fixed: the O() macro behind l0() reads them.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (px + 0.5) * pt;

	// 3x3 taps, row by row, starting with the y = -1 row.
	const min16float k0 = l0(-1.0, -1.0);
	const min16float k1 = l0(0.0, -1.0);
	const min16float k2 = l0(1.0, -1.0);
	const min16float k3 = l0(-1.0, 0.0);
	const min16float k4 = l0(0.0, 0.0);
	const min16float k5 = l0(1.0, 0.0);
	const min16float k6 = l0(-1.0, 1.0);
	const min16float k7 = l0(0.0, 1.0);
	const min16float k8 = l0(1.0, 1.0);

	t0[px] = f0(k0, k1, k2, k3, k4, k5, k6, k7, k8);
	t1[px] = f1(k0, k1, k2, k3, k4, k5, k6, k7, k8);
}

//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3

// l0 / l1: the texel of t0 / t1 at offset (x, y) from the current pixel,
// fetched through O and converted to V4.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))

// Pass 2 (conv1), first output vector. Each of the 36 V4 inputs (s0_0..s3_8)
// is multiplied by its own constant 4x4 matrix with mul(vector, matrix) and
// accumulated in order; a constant V4 offset is added last.
// Pass2 calls this with the positive/negative parts of the 3x3 taps of t0 and
// t1 and stores the result in t2.
// NOTE(review): the literals are presumably trained network weights; the
// accumulation order is kept as-is because it can affect min16float rounding.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(7.103e-02, 1.495e-01, -1.731e-02, -9.952e-02, -1.539e-01, -1.103e-01, 7.099e-02, 2.023e-01, 2.681e-02, 5.202e-03, 1.954e-02, -6.822e-02, -1.650e-01, 3.710e-01, -6.020e-01, 4.879e-01));
	r += mul(s0_1, M4(-1.168e-02, 2.587e-01, -4.670e-01, -3.986e-02, -1.268e-01, 3.619e-02, 5.712e-02, 1.722e-01, 4.473e-02, -1.224e-01, 8.228e-02, -3.981e-02, 4.044e-01, -3.039e-01, -3.390e-01, 5.925e-02));
	r += mul(s0_2, M4(4.083e-02, 7.140e-02, -5.864e-01, 1.188e-01, 2.214e-01, -2.826e-01, 2.294e-01, -2.199e-01, -9.048e-02, 1.787e-01, -6.887e-02, -6.645e-02, -1.285e-01, -8.261e-02, -1.975e-01, 2.428e-01));
	r += mul(s0_3, M4(-5.801e-02, -3.381e-02, -2.285e-01, 9.377e-02, 1.878e-01, 9.285e-02, -1.001e-01, -5.059e-02, -2.155e-02, -9.098e-02, -1.279e-02, 9.801e-02, 1.178e-01, -1.967e-01, -4.792e-02, -1.106e-01));
	r += mul(s0_4, M4(3.048e-01, 2.731e-01, -2.351e-01, -1.516e-01, -1.382e-02, 1.296e-01, -9.530e-02, 2.975e-02, 2.411e-01, 2.343e-02, 1.731e-02, -2.331e-01, -2.161e-01, 4.114e-01, 4.417e-01, 1.225e+00));
	r += mul(s0_5, M4(3.337e-01, 2.844e-01, 1.065e-01, -2.391e-01, -1.265e-01, -3.625e-02, -7.062e-02, 3.529e-02, 2.208e-02, -8.459e-03, -1.366e-01, -1.563e-02, -1.648e-01, -5.919e-01, 4.061e-01, -4.975e-02));
	r += mul(s0_6, M4(6.213e-03, -2.020e-02, 2.520e-03, 2.167e-02, -2.361e-01, -1.421e-01, -4.579e-02, -1.353e-01, -2.883e-01, -5.900e-04, 2.720e-02, 1.591e-01, -5.120e-01, -4.253e-01, -3.397e-02, -4.633e-01));
	r += mul(s0_7, M4(2.456e-01, -6.978e-02, 5.668e-02, -9.795e-03, -1.925e-01, -4.841e-02, -1.273e-02, 1.282e-02, -1.223e-01, -4.080e-02, 2.975e-02, 1.595e-01, -3.345e-01, -1.504e-01, 1.080e-01, 8.549e-01));
	r += mul(s0_8, M4(8.700e-02, 1.611e-02, 8.589e-02, -3.284e-02, -1.637e-01, 2.627e-01, 1.851e-02, 2.843e-02, 1.224e-01, 6.163e-02, 4.991e-02, -1.510e-01, 1.885e-01, -5.951e-02, -3.463e-02, 2.172e-01));
	r += mul(s1_0, M4(1.856e-01, -1.041e-01, 1.900e-01, 8.420e-02, -3.223e-01, 6.258e-02, -9.766e-02, -6.517e-01, 3.066e-02, -7.562e-02, 1.015e-02, -1.139e-01, 1.569e-02, -3.684e-02, -2.813e-02, 8.835e-02));
	r += mul(s1_1, M4(-7.107e-02, -1.146e-01, 5.488e-01, -2.960e-01, 3.743e-01, -5.368e-01, -2.219e-01, -3.122e-01, 2.468e-02, -7.477e-01, 1.858e-01, 3.498e-01, 1.771e-03, 4.215e-03, 8.478e-02, 9.318e-02));
	r += mul(s1_2, M4(-2.350e-03, -3.382e-01, 5.964e-01, -2.321e-01, 2.011e-01, 1.890e-01, -2.062e-01, -3.725e-02, -1.003e-01, -1.464e-01, 1.040e-01, 9.994e-02, -7.113e-02, -3.827e-02, -1.258e-01, -1.584e-01));
	r += mul(s1_3, M4(-1.609e-01, -1.460e-01, -4.804e-03, 5.503e-02, 2.784e-01, -1.475e-02, 9.395e-02, -1.128e-01, 1.032e-02, -1.969e-01, 2.170e-01, 2.335e-01, -1.371e-01, 4.853e-02, 8.945e-03, -2.698e-01));
	r += mul(s1_4, M4(7.739e-02, -1.105e-01, 3.348e-01, 1.093e-01, -7.745e-02, -1.642e-01, -2.191e-01, -2.674e-02, 4.199e-01, -3.302e-01, 1.445e-01, -2.815e-01, -3.154e-01, 6.646e-02, 8.520e-02, -1.053e-01));
	r += mul(s1_5, M4(-4.165e-01, -8.545e-02, 2.291e-01, -1.042e-01, 3.791e-01, -7.209e-02, -6.332e-02, -3.174e-01, 1.038e-01, 8.122e-03, -9.715e-02, 6.808e-01, -9.362e-02, -4.634e-02, 5.184e-03, 1.295e-01));
	r += mul(s1_6, M4(-8.179e-02, -8.513e-02, 4.470e-02, -7.799e-02, -1.092e-01, -1.851e-01, -1.025e-01, -4.220e-02, -3.853e-01, 3.040e-02, -9.081e-02, 1.439e-01, -2.730e-02, -5.086e-02, 5.352e-03, -5.102e-03));
	r += mul(s1_7, M4(7.601e-02, -1.423e-01, 3.421e-01, 2.574e-03, 1.165e-01, 6.863e-03, 1.250e-02, -4.862e-02, -3.859e-01, -1.108e-01, 2.515e-02, 5.564e-01, 2.485e-01, 2.230e-01, -3.839e-02, 3.605e-02));
	r += mul(s1_8, M4(-9.424e-02, 1.248e-01, 1.980e-01, -1.671e-01, 1.098e-01, 6.555e-02, -7.194e-02, -1.626e-01, -1.439e-01, -2.086e-01, -1.925e-02, 1.520e-01, 2.139e-01, -7.764e-02, 6.469e-02, 7.875e-03));
	r += mul(s2_0, M4(4.572e-02, 3.661e-02, -3.845e-01, -1.383e-01, 1.729e-02, 1.780e-02, 3.664e-02, -6.961e-02, -9.001e-03, -1.853e-02, -6.735e-02, -1.864e-02, 1.695e-01, -1.420e-01, 2.679e-01, -1.525e-01));
	r += mul(s2_1, M4(9.967e-02, -2.869e-01, -2.251e-01, 8.470e-02, 3.178e-02, -9.701e-03, 9.260e-02, 4.087e-04, -8.081e-02, 1.341e-01, 5.882e-03, 1.043e-02, 8.559e-03, 6.534e-02, -4.619e-01, -3.010e-01));
	r += mul(s2_2, M4(-1.676e-02, -3.339e-01, 1.848e-01, -2.562e-01, -8.563e-02, 2.487e-02, 2.495e-01, 9.448e-02, 2.189e-02, -3.018e-02, 5.698e-02, 6.041e-02, -4.869e-02, -2.627e-02, 1.602e-01, 1.092e-01));
	r += mul(s2_3, M4(6.867e-02, -1.693e-01, -1.614e-01, -1.944e-01, 1.992e-01, 1.720e-01, 2.393e-01, 1.219e-02, 4.866e-02, -1.165e-01, -1.285e-01, 2.929e-01, 2.043e-01, -1.399e-02, 1.595e-02, -2.746e-01));
	r += mul(s2_4, M4(-4.477e-01, -5.696e-01, -1.760e-02, 1.362e-01, 1.472e-01, 3.113e-01, -2.419e-01, 8.650e-02, -8.358e-02, 1.081e-01, 3.881e-02, -1.400e-01, -2.071e-01, 3.977e-02, -3.149e-01, 2.525e-01));
	r += mul(s2_5, M4(5.496e-02, -9.963e-02, -1.227e-01, -1.892e-01, 4.361e-02, -3.776e-01, -6.576e-01, 2.628e-01, -8.215e-02, -8.123e-02, 2.248e-03, 1.261e-01, 1.193e-01, 2.608e-01, 2.567e-01, 8.120e-02));
	r += mul(s2_6, M4(-1.587e-01, -9.849e-02, 1.122e-01, -5.963e-02, -9.176e-02, 7.341e-03, 1.164e-03, -5.660e-02, 1.567e-01, -6.958e-02, -3.780e-02, 4.238e-04, -6.186e-02, 1.777e-01, 2.398e-01, 6.853e-03));
	r += mul(s2_7, M4(1.062e-01, -1.498e-01, 5.492e-02, 1.108e-01, -3.248e-01, -2.901e-01, -4.360e-01, 1.128e-01, 7.346e-02, 8.659e-02, 9.740e-02, -1.434e-01, 1.538e-01, 1.349e-01, 1.408e-01, -1.367e-01));
	r += mul(s2_8, M4(1.412e-01, -8.889e-02, 2.029e-02, -1.523e-01, 4.847e-01, -7.432e-01, -1.181e-01, 4.132e-01, 3.119e-02, -5.840e-02, -2.292e-02, -3.125e-02, 2.440e-02, 2.815e-02, 2.759e-01, -8.781e-02));
	r += mul(s3_0, M4(2.147e-02, 2.192e-01, 2.489e-01, -3.436e-02, 1.086e-02, -2.680e-02, -9.925e-02, 3.978e-02, 1.239e-01, 3.645e-02, 5.463e-01, 5.005e-01, 1.039e-01, -1.694e-01, -3.816e-02, 3.834e-01));
	r += mul(s3_1, M4(1.418e-01, 5.806e-02, 1.317e-01, 2.227e-01, 1.486e-02, -4.235e-03, -5.750e-02, -1.548e-01, -7.700e-01, 3.263e-01, -1.193e-02, 3.537e-01, -2.841e-01, 4.657e-01, -1.576e-01, -9.526e-02));
	r += mul(s3_2, M4(7.641e-02, 8.195e-01, 1.080e-01, 1.814e-01, -5.471e-02, 2.211e-02, -4.212e-02, -1.249e-02, 2.469e-02, 5.436e-01, 3.805e-01, -9.622e-02, -6.358e-02, -3.739e-01, -3.504e-01, -2.627e-01));
	r += mul(s3_3, M4(-9.359e-02, -1.830e-02, -7.015e-02, -7.774e-02, 2.286e-01, -6.321e-02, -5.124e-02, -2.799e-03, -5.063e-01, -1.835e-01, 3.716e-01, 1.130e+00, 3.259e-01, -2.045e-01, -1.792e-01, 4.892e-01));
	r += mul(s3_4, M4(-7.478e-01, -1.192e-01, 1.022e-01, 8.111e-01, 7.253e-02, 2.280e-01, -1.116e-01, -2.828e-01, -2.364e-01, -1.233e+00, -1.125e+00, 1.750e+00, -1.215e+00, 4.973e-02, 2.070e-01, 6.996e-01));
	r += mul(s3_5, M4(-4.115e-02, 3.613e-01, 2.694e-01, 4.126e-02, 7.046e-02, 6.242e-02, 9.300e-02, -1.965e-01, -3.211e-01, 8.504e-01, 2.518e-01, -5.622e-01, 5.663e-02, -1.139e-01, 1.150e-01, -1.954e-01));
	r += mul(s3_6, M4(-1.870e-01, -9.168e-02, -8.947e-02, 6.127e-03, 1.163e-02, 3.733e-04, -3.330e-01, 1.935e-01, 3.424e-01, 1.313e-01, -6.732e-01, 8.256e-02, 6.713e-02, 2.980e-02, -6.912e-02, 1.715e-01));
	r += mul(s3_7, M4(1.636e-01, 1.212e-01, 2.280e-02, 1.552e-01, -4.955e-01, 8.376e-01, 1.476e-01, 2.192e-01, 9.746e-01, -3.148e-01, 8.206e-01, -8.104e-01, -7.918e-02, -1.604e-01, 5.505e-02, 7.640e-02));
	r += mul(s3_8, M4(1.248e-01, 2.878e-01, -4.182e-02, -9.214e-02, -1.210e-01, 4.382e-01, 8.062e-02, -3.051e-01, -1.803e-01, -3.041e-01, 1.368e-01, -1.030e-01, 2.941e-02, -2.724e-01, 3.480e-02, 1.396e-02));
	// Constant offset added after all 36 inputs.
	r += V4(-3.046e-02, 3.515e-02, 4.880e-02, 4.740e-03);
	return r;
}

// Pass 2 (conv1), second output vector. Same form as f0 of this pass: each of
// the 36 V4 inputs is multiplied by its own constant 4x4 matrix with
// mul(vector, matrix) and accumulated in order, then a constant V4 offset is
// added. Only the constants differ. Pass2 stores the result in t3.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(1.574e-01, -6.104e-03, -2.288e-01, 5.024e-03, -2.149e-03, -8.674e-02, 1.209e-01, 7.107e-02, 1.242e-01, 2.312e-03, -5.300e-02, -2.285e-01, -8.824e-02, 7.402e-02, -4.447e-01, 1.117e+00));
	r += mul(s0_1, M4(-5.617e-02, 3.613e-01, -4.666e-01, 1.795e-01, 1.718e-01, -1.005e-01, -2.593e-01, 4.103e-01, 2.477e-01, 1.883e-01, 3.928e-02, -3.635e-01, -7.353e-01, 3.209e-01, 2.171e-01, 3.924e-01));
	r += mul(s0_2, M4(-3.304e-01, 6.332e-01, -3.898e-01, 2.704e-01, 4.110e-02, -2.786e-01, -2.513e-01, 1.800e-01, 9.402e-03, -1.975e-01, -4.040e-02, -2.047e-01, -3.239e-01, -1.623e-01, 1.001e-01, 3.053e-02));
	r += mul(s0_3, M4(3.935e-01, 5.218e-02, 4.630e-02, 2.202e-02, -2.172e-01, -1.530e-02, -1.782e-01, -9.327e-02, -6.425e-02, -2.402e-02, -2.919e-02, -4.034e-02, 6.589e-01, -4.900e-02, 7.783e-02, 6.334e-01));
	r += mul(s0_4, M4(-5.475e-02, 1.543e-01, -1.597e-01, -2.500e-01, 4.990e-02, 5.780e-02, 1.162e-01, 1.140e-01, -2.980e-01, -2.524e-02, -2.103e-01, 4.297e-01, 4.528e-01, -3.098e-01, -1.415e-01, 7.565e-01));
	r += mul(s0_5, M4(-1.097e-01, 3.376e-01, -5.685e-01, 1.347e-01, 1.155e-01, -1.396e-01, -2.840e-01, -1.373e-01, 1.442e-01, 8.711e-02, 1.357e-01, -1.110e-01, 2.095e-01, -2.901e-01, -1.007e-01, -2.473e-01));
	r += mul(s0_6, M4(-7.405e-02, 9.320e-02, -5.870e-02, -2.569e-01, 6.017e-03, -8.078e-02, -3.798e-02, 2.334e-01, 1.440e-01, -1.852e-01, -6.627e-03, 3.514e-03, -1.499e-02, -6.237e-02, 3.665e-01, 3.270e-01));
	r += mul(s0_7, M4(2.443e-01, 8.076e-02, -2.143e-01, 1.120e-01, 1.187e-01, 1.317e-01, 1.811e-01, 1.918e-01, -2.164e-02, -1.829e-01, 2.105e-01, 3.085e-01, 3.155e-01, 2.801e-01, -6.834e-01, 2.861e-01));
	r += mul(s0_8, M4(9.974e-03, 9.704e-02, -2.363e-01, 1.829e-01, 1.844e-02, 9.298e-02, -5.319e-02, -5.899e-02, -2.154e-01, 2.555e-02, -8.374e-02, 1.254e-01, -2.736e-01, -4.065e-02, 4.838e-02, 3.338e-02));
	r += mul(s1_0, M4(-1.239e-02, -1.316e-01, 8.694e-02, -8.443e-02, -1.143e-01, -6.018e-02, -9.054e-02, 7.381e-02, 2.722e-01, 1.030e-01, -8.583e-02, -4.433e-01, -1.339e-01, 1.264e-01, 8.581e-02, -1.947e-01));
	r += mul(s1_1, M4(3.030e-01, -3.527e-02, 4.665e-01, -3.372e-02, -2.301e-02, 7.308e-01, 5.938e-01, -5.901e-01, 4.766e-01, 1.081e-01, 8.809e-02, 3.482e-01, -1.938e-01, -8.091e-02, 3.649e-02, 9.321e-02));
	r += mul(s1_2, M4(1.376e-01, -4.460e-01, 4.298e-01, -4.809e-02, -3.819e-01, 5.216e-01, 2.687e-01, 1.359e-01, 2.936e-01, 1.222e-02, 3.706e-01, 2.481e-01, -4.716e-02, -1.798e-02, 2.731e-02, -7.140e-02));
	r += mul(s1_3, M4(1.657e-01, -3.624e-02, 1.541e-01, -5.006e-03, -4.051e-01, -9.782e-02, 3.008e-02, 1.962e-01, -6.146e-02, 1.866e-03, -3.052e-01, -2.202e-01, 1.057e-01, -1.151e-01, -6.310e-02, 3.914e-01));
	r += mul(s1_4, M4(-2.629e-01, 1.029e-01, 1.812e-02, -2.950e-01, -1.191e-01, 2.580e-01, -4.833e-01, 1.095e-01, 2.309e-02, 4.519e-02, 1.086e-01, 5.362e-01, -1.349e-01, -1.278e-01, 7.109e-02, -1.992e-01));
	r += mul(s1_5, M4(-1.815e-01, 2.898e-01, 3.446e-01, -1.587e-01, -6.360e-02, 1.662e-01, 5.187e-01, 1.701e-01, -2.770e-02, -5.932e-01, 2.467e-01, 3.940e-01, 1.022e-01, 1.033e-01, -5.084e-02, -6.520e-02));
	r += mul(s1_6, M4(-1.494e-01, 3.180e-02, 9.864e-02, -3.409e-01, 1.397e-02, 9.932e-03, -2.110e-01, 2.636e-01, 1.353e-01, -8.495e-02, -2.680e-03, -2.287e-01, 1.136e-01, -1.047e-01, 2.910e-02, 9.922e-02));
	r += mul(s1_7, M4(1.533e-01, 4.819e-04, 1.735e-01, 2.027e-01, 1.316e-01, 1.029e-01, 1.446e-01, 1.737e-01, 4.855e-02, 4.781e-02, 2.025e-01, 1.587e-01, 1.661e-01, 7.134e-02, 5.853e-02, -1.530e-01));
	r += mul(s1_8, M4(-1.476e-01, -4.916e-02, 1.989e-01, 1.159e-01, 4.753e-02, 1.694e-01, 4.343e-02, -6.974e-03, 3.382e-02, 2.275e-01, 3.466e-01, -7.178e-03, -1.104e-01, 2.059e-03, -7.101e-02, 8.934e-02));
	r += mul(s2_0, M4(-3.467e-01, 8.471e-04, 1.580e-01, 2.685e-01, -2.680e-02, -6.444e-02, 8.843e-02, 5.232e-03, 2.576e-02, -3.756e-02, -7.913e-03, -3.871e-02, -5.374e-02, -6.060e-02, -7.688e-02, 6.738e-01));
	r += mul(s2_1, M4(-3.963e-01, 1.295e-01, 2.623e-01, 2.565e-01, -1.831e-01, -6.054e-02, 1.817e-01, -8.944e-02, 1.974e-01, -2.800e-04, -3.964e-02, 1.232e-01, -3.477e-01, 3.791e-01, 1.438e-01, -7.862e-02));
	r += mul(s2_2, M4(2.540e-02, 1.123e-01, 6.461e-01, -3.856e-03, 3.373e-02, -5.719e-02, 1.556e-01, -1.100e-01, -3.499e-02, 9.146e-02, -4.624e-02, 9.774e-02, -1.148e-01, -2.280e-01, 4.977e-01, -1.568e-01));
	r += mul(s2_3, M4(5.352e-02, -1.293e-01, -6.991e-03, 4.190e-01, -2.334e-03, -4.433e-02, -8.470e-02, 1.162e-01, -1.045e-01, -7.444e-02, 8.951e-02, -1.124e-01, 4.295e-01, 1.086e-01, 1.336e-01, 2.645e-01));
	r += mul(s2_4, M4(-4.062e-01, -6.781e-02, 4.629e-01, -4.931e-01, -1.875e-01, 1.958e-01, -4.560e-01, -2.286e-02, -2.066e-01, 1.151e-01, -5.924e-02, 1.350e-01, -1.752e-01, 2.244e-01, -3.564e-02, -6.129e-01));
	r += mul(s2_5, M4(6.644e-02, 4.611e-01, 9.200e-02, 6.845e-03, -1.628e-02, 8.352e-02, -1.119e-01, -4.386e-02, -5.822e-02, -4.769e-02, -3.224e-02, -1.235e-01, -3.296e-01, 5.835e-03, 2.231e-01, 5.535e-02));
	r += mul(s2_6, M4(-2.961e-02, -5.230e-02, 5.124e-02, 6.542e-02, 2.004e-01, 1.189e-01, -1.797e-01, -1.535e-02, 6.469e-02, 1.134e-01, -1.204e-04, -7.606e-02, 2.436e-02, -1.630e-02, 1.841e-01, -2.529e-01));
	r += mul(s2_7, M4(-1.147e-02, 3.246e-02, 7.626e-02, -1.013e-01, 1.075e-01, 5.871e-01, -5.227e-01, -3.076e-01, 1.609e-01, 5.768e-02, -1.912e-02, 5.898e-02, -7.530e-02, -1.307e-01, 5.828e-02, -1.456e-02));
	r += mul(s2_8, M4(-7.053e-02, 8.728e-02, 1.211e-01, 1.410e-01, -2.160e-01, 9.970e-02, -5.345e-01, 1.141e-01, 8.112e-04, -4.348e-02, 9.858e-02, 2.780e-02, -1.116e-01, -2.331e-01, 1.545e-01, 7.984e-02));
	r += mul(s3_0, M4(-5.412e-02, 6.012e-03, -2.395e-01, -1.209e-02, -5.734e-02, 3.058e-02, -7.202e-02, -7.514e-02, 7.241e-03, -1.702e-01, 1.020e+00, 2.997e-01, -2.173e-01, 4.518e-02, -2.703e-02, -4.087e-02));
	r += mul(s3_1, M4(-5.670e-02, -9.713e-03, -2.091e-01, -1.621e-01, -5.370e-03, -5.579e-02, 1.042e-01, 2.220e-02, 4.788e-01, -6.623e-01, 5.548e-01, 8.186e-01, 2.462e-01, -7.624e-01, -9.065e-02, -1.105e-02));
	r += mul(s3_2, M4(4.043e-02, -1.577e-01, -3.166e-01, -1.256e-01, -9.515e-02, -8.852e-02, -4.960e-02, 1.129e-01, 1.690e-01, 2.314e-01, -5.134e-01, 9.584e-02, -3.085e-02, 2.399e-01, -3.381e-01, -7.233e-02));
	r += mul(s3_3, M4(1.750e-01, -9.450e-02, -2.230e-01, 4.190e-01, 8.900e-02, 2.306e-02, 2.783e-01, -3.295e-01, 2.697e+00, 8.855e-02, 5.728e-01, -8.682e-01, 6.085e-02, 5.010e-02, 1.343e-01, 1.137e-01));
	r += mul(s3_4, M4(9.857e-02, 3.310e-01, -3.584e-01, -5.586e-01, 5.751e-01, -4.023e-01, 3.838e-01, 1.240e-01, -1.482e-01, -1.233e-01, -5.953e-01, 1.534e+00, 3.390e-01, -2.022e-02, 1.619e-01, -2.959e-01));
	r += mul(s3_5, M4(1.528e-01, 1.593e-01, -1.886e-01, 2.281e-02, 2.174e-01, -8.846e-01, 5.726e-02, 7.369e-03, -1.490e-01, 3.377e-01, -4.669e-02, 1.206e-01, -1.251e-01, 2.600e-01, -2.439e-01, 2.067e-01));
	r += mul(s3_6, M4(4.090e-02, -2.118e-02, -9.012e-02, -8.624e-03, 1.464e-01, 6.929e-02, 1.492e-01, -4.039e-01, 6.123e-01, 2.679e-01, -2.284e-01, -3.609e-01, -6.598e-02, 1.341e-01, -2.371e-02, -2.899e-01));
	r += mul(s3_7, M4(5.189e-02, -3.928e-02, 1.670e-01, -1.536e-01, 5.066e-01, -3.768e-01, 6.577e-01, 1.140e-01, -1.537e-01, -1.941e-01, -9.152e-02, -3.571e-02, 1.068e-01, 4.803e-02, -3.180e-01, 4.361e-02));
	r += mul(s3_8, M4(-8.453e-02, -1.454e-02, 3.613e-02, 8.974e-03, -1.258e-01, -5.842e-01, 3.264e-01, 2.910e-01, 1.306e-01, 4.552e-01, 4.524e-01, 1.065e-02, -1.792e-02, 1.875e-02, -2.206e-01, 2.028e-01));
	// Constant offset added after all 36 inputs.
	r += V4(3.015e-03, -4.690e-02, 3.573e-02, -1.486e-02);
	return r;
}

// Pass 2 ("conv1"): handles one pixel per thread. It reads the 3x3
// neighbourhoods of t0 and t1, splits every tap into max(x, 0) and
// -max(-x, 0), and feeds the 36 resulting vectors to f0 and f1, whose outputs
// are written to t2 and t3 at the pixel's coordinate.
//   blockStart - added to the per-thread offset to form the pixel coordinate.
//   tid        - only tid.x is read; Rmp8x8 turns it into a 2D offset.
void Pass2(uint2 blockStart, uint3 tid) {
	const uint2 dims = GetInputSize();
	const uint2 px = blockStart + Rmp8x8(tid.x);
	// Nothing to do for threads whose pixel falls outside the input.
	if (!(px.x < dims.x && px.y < dims.y)) {
		return;
	}

	// The names `pt` and `pos` are fixed: the O() macro behind l0()/l1() reads them.
	const float2 pt = float2(GetInputPt());
	const float2 pos = (px + 0.5) * pt;

	// Raw 3x3 taps of t0, row by row, starting with the y = -1 row.
	const V4 a0 = l0(-1.0, -1.0);
	const V4 a1 = l0(0.0, -1.0);
	const V4 a2 = l0(1.0, -1.0);
	const V4 a3 = l0(-1.0, 0.0);
	const V4 a4 = l0(0.0, 0.0);
	const V4 a5 = l0(1.0, 0.0);
	const V4 a6 = l0(-1.0, 1.0);
	const V4 a7 = l0(0.0, 1.0);
	const V4 a8 = l0(1.0, 1.0);

	// Raw 3x3 taps of t1, same order.
	const V4 b0 = l1(-1.0, -1.0);
	const V4 b1 = l1(0.0, -1.0);
	const V4 b2 = l1(1.0, -1.0);
	const V4 b3 = l1(-1.0, 0.0);
	const V4 b4 = l1(0.0, 0.0);
	const V4 b5 = l1(1.0, 0.0);
	const V4 b6 = l1(-1.0, 1.0);
	const V4 b7 = l1(0.0, 1.0);
	const V4 b8 = l1(1.0, 1.0);

	// s0_* / s1_*: positive and negative parts of the t0 taps.
	const V4 s0_0 = max(a0, 0.0);
	const V4 s0_1 = max(a1, 0.0);
	const V4 s0_2 = max(a2, 0.0);
	const V4 s0_3 = max(a3, 0.0);
	const V4 s0_4 = max(a4, 0.0);
	const V4 s0_5 = max(a5, 0.0);
	const V4 s0_6 = max(a6, 0.0);
	const V4 s0_7 = max(a7, 0.0);
	const V4 s0_8 = max(a8, 0.0);
	const V4 s1_0 = -max(-a0, 0.0);
	const V4 s1_1 = -max(-a1, 0.0);
	const V4 s1_2 = -max(-a2, 0.0);
	const V4 s1_3 = -max(-a3, 0.0);
	const V4 s1_4 = -max(-a4, 0.0);
	const V4 s1_5 = -max(-a5, 0.0);
	const V4 s1_6 = -max(-a6, 0.0);
	const V4 s1_7 = -max(-a7, 0.0);
	const V4 s1_8 = -max(-a8, 0.0);

	// s2_* / s3_*: positive and negative parts of the t1 taps.
	const V4 s2_0 = max(b0, 0.0);
	const V4 s2_1 = max(b1, 0.0);
	const V4 s2_2 = max(b2, 0.0);
	const V4 s2_3 = max(b3, 0.0);
	const V4 s2_4 = max(b4, 0.0);
	const V4 s2_5 = max(b5, 0.0);
	const V4 s2_6 = max(b6, 0.0);
	const V4 s2_7 = max(b7, 0.0);
	const V4 s2_8 = max(b8, 0.0);
	const V4 s3_0 = -max(-b0, 0.0);
	const V4 s3_1 = -max(-b1, 0.0);
	const V4 s3_2 = -max(-b2, 0.0);
	const V4 s3_3 = -max(-b3, 0.0);
	const V4 s3_4 = -max(-b4, 0.0);
	const V4 s3_5 = -max(-b5, 0.0);
	const V4 s3_6 = -max(-b6, 0.0);
	const V4 s3_7 = -max(-b7, 0.0);
	const V4 s3_8 = -max(-b8, 0.0);

	t2[px] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
	t3[px] = f1(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s3_0, s3_1, s3_2, s3_3, s3_4, s3_5, s3_6, s3_7, s3_8);
}

//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1

// l0 / l1: the texel of t2 / t3 at offset (x, y) from the current pixel,
// fetched through O and converted to V4.
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))

// Pass 3 (conv2), first output vector. Each of the 36 V4 inputs (s0_0..s3_8)
// is multiplied by its own constant 4x4 matrix with mul(vector, matrix) and
// accumulated in order; a constant V4 offset is added last.
// NOTE(review): the caller is outside the portion of the file reviewed here;
// presumably it mirrors Pass2 with t2/t3 as sources - confirm. The literals
// are presumably trained network weights; the accumulation order is kept
// as-is because it can affect min16float rounding.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(8.130e-02, -1.243e-01, -7.648e-02, -2.424e-01, -4.742e-02, -5.420e-02, 4.117e-02, 1.568e-01, -3.621e-02, 2.032e-01, 4.484e-02, 1.249e-02, -1.505e-01, 7.294e-02, 4.943e-02, -6.336e-02));
	r += mul(s0_1, M4(-1.474e-01, -3.366e-01, -5.670e-01, 4.113e-02, -1.260e-01, -1.539e-01, -5.421e-02, 1.779e-01, -1.072e-01, 1.209e-01, 4.423e-02, 2.454e-01, -5.430e-02, -1.442e-01, -1.501e-02, -4.731e-02));
	r += mul(s0_2, M4(6.444e-02, 1.509e-01, 1.452e-01, -2.840e-02, 9.365e-02, 2.016e-01, 1.002e-01, -3.226e-02, -1.186e-01, 1.535e-01, -1.652e-01, -1.104e-02, 4.170e-02, -4.404e-02, 1.189e-01, 1.007e-02));
	r += mul(s0_3, M4(-4.618e-02, 1.024e-01, -1.723e-01, -1.354e-01, 1.981e-01, -1.992e-01, 1.670e-01, 3.857e-01, -6.927e-03, 9.087e-02, 1.176e-01, 3.314e-01, 9.860e-02, 4.009e-04, 1.061e-01, -6.930e-02));
	r += mul(s0_4, M4(4.923e-01, -9.248e-02, 8.616e-03, 4.541e-02, -1.148e-01, 3.990e-03, -3.218e-02, 8.942e-02, 3.219e-02, -9.786e-02, 6.813e-02, 2.492e-01, -3.165e-01, 6.925e-02, -9.826e-02, 4.518e-01));
	r += mul(s0_5, M4(2.222e-02, 1.046e-01, -2.327e-02, -7.823e-02, 3.540e-01, 3.363e-01, -4.089e-02, 1.292e-02, -2.530e-01, 4.606e-01, -6.191e-02, -3.673e-02, -2.764e-01, -1.360e-01, -2.947e-03, 2.534e-02));
	r += mul(s0_6, M4(-2.067e-02, -1.566e-01, -8.968e-02, 1.386e-03, -8.841e-02, -1.077e-01, 1.646e-01, 1.987e-01, -3.098e-01, 2.764e-01, 1.935e-01, 1.847e-01, 1.116e-01, -1.514e-01, -5.175e-02, 8.710e-02));
	r += mul(s0_7, M4(1.266e-02, -2.119e-01, -1.610e-01, -6.512e-02, -1.679e-01, 2.247e-01, -5.854e-02, 1.200e-02, -1.406e-01, 4.393e-01, -8.517e-02, 3.281e-02, -1.177e-01, -1.861e-01, -3.241e-01, -2.918e-02));
	r += mul(s0_8, M4(-3.015e-02, -1.605e-01, -1.001e-01, 7.795e-03, -5.873e-02, -7.686e-02, -1.448e-01, -1.851e-02, -2.172e-01, 1.977e-01, -1.333e-01, -8.894e-02, -8.939e-03, 1.675e-01, -7.976e-03, 4.020e-02));
	r += mul(s1_0, M4(1.165e-01, -4.833e-02, 4.750e-02, -4.032e-02, -2.287e-02, -4.825e-02, 9.058e-02, 2.136e-01, 1.009e-01, -2.133e-02, 4.162e-02, -6.816e-02, -9.863e-02, -4.160e-03, -2.467e-02, -9.096e-02));
	r += mul(s1_1, M4(8.597e-02, -2.205e-01, 1.515e-01, -2.918e-02, -1.099e-01, -4.171e-02, 3.893e-04, -5.273e-03, -2.046e-02, -3.905e-03, 7.793e-04, 5.930e-02, 2.653e-02, -2.546e-01, -8.456e-02, -6.554e-02));
	r += mul(s1_2, M4(-1.058e-01, 3.302e-01, 1.812e-01, 6.427e-02, -4.601e-02, -1.589e-02, 4.405e-02, -1.366e-02, -5.996e-03, -5.402e-04, 3.237e-02, -5.725e-02, -7.486e-02, 1.358e-01, 4.739e-02, -2.432e-02));
	r += mul(s1_3, M4(3.333e-02, 5.179e-01, -1.939e-03, 7.798e-02, 2.011e-02, -2.959e-01, 1.135e-01, 3.122e-01, 8.651e-02, -2.708e-02, 7.183e-03, 4.554e-02, -3.342e-02, 9.136e-03, -7.067e-02, -1.867e-01));
	r += mul(s1_4, M4(6.231e-01, 9.512e-01, 3.523e-01, 3.744e-01, 2.388e-01, -2.827e-01, 9.968e-02, -5.306e-02, -4.498e-02, -2.222e-01, -5.865e-02, 2.967e-02, -3.029e-01, -2.137e-01, -5.363e-01, 8.872e-02));
	r += mul(s1_5, M4(-4.862e-02, 7.326e-01, 1.354e-01, 5.607e-02, 1.667e-01, -1.184e-01, -1.304e-01, 6.817e-02, 3.287e-02, 3.310e-01, 1.521e-01, -3.212e-02, -8.947e-02, 4.250e-02, -9.770e-02, -8.344e-02));
	r += mul(s1_6, M4(-9.242e-04, 4.835e-03, 1.322e-01, 3.745e-02, 9.613e-02, -8.310e-03, 4.718e-02, 2.763e-02, -1.616e-02, 6.167e-02, -3.382e-02, 3.624e-02, 1.213e-02, -2.014e-01, -2.776e-03, 4.360e-02));
	r += mul(s1_7, M4(-6.861e-02, 4.772e-02, -3.779e-02, 7.567e-02, -8.548e-02, -1.028e-02, 1.881e-02, 2.421e-03, 1.378e-01, 1.305e-01, 2.177e-02, -1.118e-03, 5.861e-02, -1.416e-01, -3.140e-01, -9.031e-02));
	r += mul(s1_8, M4(-4.147e-02, 1.546e-01, 5.650e-02, 4.098e-02, -1.460e-01, -5.779e-02, -1.959e-02, -2.318e-02, 3.538e-02, -5.044e-02, 3.304e-02, -3.517e-03, -1.176e-01, -3.185e-01, -1.738e-01, -4.349e-02));
	r += mul(s2_0, M4(-3.428e-03, 6.059e-02, 7.024e-02, 2.739e-02, 1.313e-02, -5.748e-02, 9.005e-03, -7.139e-03, 1.165e-01, -1.541e-01, 1.493e-01, 2.725e-01, 3.254e-02, -2.934e-02, 1.115e-02, -2.844e-02));
	r += mul(s2_1, M4(-8.601e-03, -3.177e-03, 1.878e-01, 1.106e-01, 1.951e-02, 8.194e-02, 4.971e-02, 5.805e-02, 2.515e-02, -2.529e-01, -2.250e-01, 3.498e-02, 7.183e-02, -8.617e-02, -8.616e-02, 1.623e-01));
	r += mul(s2_2, M4(-8.072e-02, -1.234e-01, 3.482e-02, -2.873e-02, -4.049e-02, 4.828e-03, 1.940e-02, 3.828e-02, -5.156e-03, 4.585e-03, 2.326e-02, 2.346e-02, -8.908e-02, -1.384e-03, -2.366e-02, 1.290e-02));
	r += mul(s2_3, M4(4.921e-02, 1.726e-01, 3.832e-02, -2.490e-01, -1.152e-01, -1.722e-01, -1.705e-01, 4.228e-01, -8.215e-02, -1.478e-02, 1.554e-01, 3.701e-01, -8.863e-02, 1.068e-01, 8.890e-03, 6.324e-02));
	r += mul(s2_4, M4(1.307e-01, 2.312e-01, -1.734e-01, 2.083e-02, -1.966e-01, -3.991e-01, -8.681e-02, 1.976e-03, -3.177e-01, 1.528e-01, -2.329e-01, 2.569e-01, -6.230e-03, 6.020e-02, 4.969e-02, -2.039e-01));
	r += mul(s2_5, M4(1.660e-01, 1.642e-02, 7.203e-02, -1.613e-01, 6.225e-02, 6.470e-02, 3.305e-03, 2.230e-02, -2.455e-02, 6.599e-02, -1.740e-01, 7.887e-02, 3.463e-03, 1.003e-01, -1.850e-01, 7.885e-02));
	r += mul(s2_6, M4(-2.170e-02, 1.372e-01, 7.445e-02, -9.419e-02, -1.851e-01, 4.957e-02, -2.454e-01, 5.879e-02, -5.800e-02, -1.122e-01, 7.445e-02, 1.190e-01, 2.695e-02, -5.701e-02, -5.166e-02, -5.058e-02));
	r += mul(s2_7, M4(5.390e-01, 1.674e-01, 1.213e-01, -1.147e-01, -6.939e-02, -1.218e-01, -2.891e-01, 2.682e-02, -2.636e-01, -1.104e-01, -1.556e-01, 3.774e-02, -4.121e-02, -2.431e-01, -1.248e-01, 1.275e-01));
	r += mul(s2_8, M4(1.053e-01, 2.238e-01, -1.104e-01, 5.372e-02, 6.179e-02, -2.431e-03, -4.843e-02, 3.820e-02, -7.539e-02, 7.898e-02, 7.562e-03, 1.596e-02, 7.298e-02, -1.553e-01, -3.545e-01, 1.990e-02));
	r += mul(s3_0, M4(8.232e-02, -6.815e-02, -7.421e-02, -3.191e-02, -1.592e-01, 2.814e-01, 5.009e-02, 3.669e-02, -5.908e-02, -5.445e-02, 4.873e-02, 1.538e-01, 1.065e-01, -2.194e-01, -2.612e-02, -2.297e-02));
	r += mul(s3_1, M4(1.431e-02, -7.835e-02, -2.790e-03, 9.305e-02, -2.975e-01, 1.527e-01, 1.888e-01, -1.279e-02, -1.938e-02, -1.022e-01, -2.197e-02, -2.919e-02, 2.192e-01, -8.056e-02, 1.328e-03, 3.478e-02));
	r += mul(s3_2, M4(4.920e-03, -6.286e-02, -7.779e-02, 1.075e-01, -1.092e-01, 2.909e-01, 3.056e-01, -9.017e-02, -3.625e-02, 1.079e-01, 1.107e-01, 6.613e-02, 1.696e-01, -1.852e-01, -1.253e-01, -9.675e-02));
	r += mul(s3_3, M4(-6.350e-02, 1.137e-01, -3.559e-02, -1.684e-01, -2.044e-01, -9.368e-02, 2.283e-01, 8.052e-01, 4.476e-03, -1.599e-01, 2.594e-02, 1.582e-01, -2.483e-02, 9.216e-02, 5.719e-02, 2.237e-01));
	r += mul(s3_4, M4(-1.694e-01, 1.597e-01, -3.311e-01, 1.880e-01, 2.614e-01, -2.584e-01, 5.296e-02, 9.726e-02, -3.932e-02, -7.518e-02, -1.749e-01, 1.604e-01, 1.008e-01, 2.920e-01, 5.358e-01, -6.383e-01));
	r += mul(s3_5, M4(-2.706e-01, -2.716e-01, -4.196e-01, 1.023e-01, 2.201e-01, -1.412e-01, 1.003e-01, -6.972e-02, 3.727e-02, -8.424e-02, -7.870e-02, 2.294e-02, 2.836e-01, -4.165e-01, -2.974e-01, -3.567e-02));
	r += mul(s3_6, M4(-3.434e-02, 6.420e-02, -8.729e-02, -8.600e-02, -2.041e-01, 1.646e-02, 9.025e-02, 1.724e-01, -4.951e-02, -3.894e-02, -7.985e-02, 1.580e-02, 2.554e-01, -3.100e-01, -2.769e-01, 8.336e-05));
	r += mul(s3_7, M4(-6.557e-02, 3.865e-02, -3.263e-02, 4.621e-02, -2.077e-01, 2.705e-02, -3.354e-01, 1.480e-01, 4.155e-02, -2.143e-01, -2.626e-01, 1.091e-02, 1.382e-01, -1.706e-01, -1.355e-01, -7.700e-02));
	r += mul(s3_8, M4(-2.004e-01, 4.575e-01, -1.812e-01, 6.102e-02, 3.469e-01, -6.634e-02, 1.302e-01, -9.621e-02, 4.023e-02, 1.048e-01, -9.194e-02, 5.130e-03, 4.272e-01, -5.971e-01, -2.025e-01, -1.364e-01));
	// Constant offset added after all 36 inputs.
	r += V4(3.575e-03, 3.041e-03, 1.241e-02, -2.230e-03);
	return r;
}

V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(-1.437e-01, -9.784e-02, 2.649e-01, -8.638e-02, -1.746e-01, 2.031e-01, 1.203e-01, -8.812e-02, -2.317e-01, 2.311e-01, 3.171e-02, -3.619e-02, -7.798e-02, -2.507e-02, 1.902e-01, 5.780e-02));
	r += mul(s0_1, M4(2.504e-01, 1.577e-01, -5.397e-02, 4.599e-01, -1.392e-01, 2.560e-01, 1.018e-01, 7.968e-02, 2.247e-01, -2.962e-03, 1.421e-03, -1.201e-01, -3.622e-01, 1.378e-01, 1.392e-01, 1.641e-01));
	r += mul(s0_2, M4(-6.143e-02, -6.336e-02, 1.131e-01, 6.811e-02, -5.817e-02, 7.362e-02, 1.407e-01, 1.823e-02, 4.880e-01, -2.282e-01, -2.704e-01, -4.287e-01, -2.741e-01, 3.163e-02, 1.098e-01, 1.514e-01));
	r += mul(s0_3, M4(1.794e-01, 1.720e-01, -4.092e-01, 1.277e-01, -1.938e-01, 3.107e-01, 2.915e-01, 2.279e-01, 2.259e-01, 2.136e-01, 5.867e-02, 2.359e-01, -1.589e-01, 1.132e-01, 6.871e-02, 2.837e-01));
	r += mul(s0_4, M4(-3.070e-01, -4.494e-01, 5.817e-02, 5.153e-01, 5.215e-01, 5.410e-01, 1.286e-01, -5.596e-01, 4.287e-01, 1.821e-01, 1.542e-01, 3.755e-01, 3.820e-01, 2.953e-01, -2.768e-01, -6.977e-02));
	r += mul(s0_5, M4(-4.881e-02, 2.327e-02, 9.209e-02, -2.102e-02, -1.394e-01, -8.093e-03, 2.263e-01, -4.307e-01, 1.998e-01, -8.793e-02, -1.057e-01, -1.899e-01, 1.577e-01, 3.435e-01, 6.721e-02, 3.093e-01));
	r += mul(s0_6, M4(7.516e-02, -1.224e-01, 1.257e-02, -6.769e-02, -8.618e-02, 1.283e-01, 2.060e-01, -1.966e-01, 8.166e-02, -1.263e-01, -2.269e-01, -3.272e-01, -3.439e-02, -2.849e-01, 2.105e-01, -3.015e-03));
	r += mul(s0_7, M4(7.447e-02, -8.731e-02, 2.804e-02, -4.819e-02, -3.311e-01, 3.824e-01, 7.766e-02, 5.672e-02, 4.014e-01, -4.037e-03, 2.287e-01, 5.626e-02, 3.481e-01, -1.010e-01, -1.156e-01, -2.865e-01));
	r += mul(s0_8, M4(5.454e-02, -5.590e-02, 3.408e-02, 3.551e-03, 1.262e-02, 8.638e-02, 1.222e-01, 3.418e-01, -2.154e-01, 1.868e-01, 1.210e-01, -2.330e-01, -4.810e-02, -5.190e-02, -8.587e-02, -2.145e-01));
	r += mul(s1_0, M4(-3.063e-01, -1.830e-02, 5.167e-01, 4.813e-02, -7.310e-02, 1.443e-01, 1.654e-01, 1.158e-01, 4.789e-02, -3.030e-02, -1.358e-01, 2.986e-02, -4.855e-02, -7.736e-02, 4.514e-01, -1.797e-02));
	r += mul(s1_1, M4(4.322e-01, -1.369e-01, 9.431e-02, 3.921e-01, 2.708e-02, -1.218e-02, -9.091e-02, 1.871e-01, 3.763e-02, -9.213e-02, -1.209e-01, -1.587e-01, 3.014e-03, 1.816e-01, 3.099e-01, 3.210e-01));
	r += mul(s1_2, M4(-7.234e-02, 1.685e-02, 4.444e-01, -1.886e-01, -9.543e-03, 3.966e-02, 1.105e-01, 4.870e-02, 9.471e-02, -5.263e-02, -1.085e-01, 4.226e-02, -1.565e-01, -3.812e-02, 1.708e-01, 1.457e-01));
	r += mul(s1_3, M4(2.370e-01, -3.354e-02, -9.648e-02, 1.531e-01, -3.468e-01, -3.957e-02, 3.152e-01, 3.402e-02, 3.762e-02, 9.507e-02, 7.836e-02, 9.088e-03, -1.614e-01, 4.377e-02, 4.748e-02, 1.055e-01));
	r += mul(s1_4, M4(2.342e-01, -5.059e-01, 2.781e-01, 2.906e-01, 1.656e-01, 1.268e-01, 1.183e-01, -2.458e-02, 2.290e-01, 1.779e-01, -8.310e-02, 1.389e-01, 7.282e-02, 1.050e-01, -3.525e-01, 6.810e-02));
	r += mul(s1_5, M4(1.078e-01, -4.451e-02, 7.031e-02, -2.977e-01, 3.596e-02, 3.359e-02, 9.589e-03, 9.070e-02, -1.862e-01, -1.863e-01, -9.652e-02, -5.039e-02, 1.004e-01, 1.598e-01, 1.466e-01, 2.349e-01));
	r += mul(s1_6, M4(1.109e-02, -1.607e-01, 1.578e-02, -1.971e-01, 5.020e-02, -7.597e-02, 7.238e-02, 7.241e-02, 2.025e-02, -2.246e-02, 4.652e-02, -8.760e-02, -1.111e-02, 1.890e-02, 1.046e-01, -2.233e-03));
	r += mul(s1_7, M4(1.252e-01, -8.046e-02, -1.321e-01, -3.724e-01, -1.383e-01, 1.151e-01, 5.397e-02, -1.422e-01, 8.319e-02, 9.089e-02, -2.620e-02, 1.662e-01, 2.847e-02, -1.255e-01, 6.933e-02, -1.636e-01));
	r += mul(s1_8, M4(-1.517e-01, 3.661e-02, -3.135e-01, -3.395e-01, -1.139e-01, 1.973e-01, 8.547e-03, -3.118e-02, -8.869e-02, -1.209e-01, 1.867e-02, -4.531e-02, 1.016e-01, -6.909e-02, 1.436e-01, 1.663e-01));
	r += mul(s2_0, M4(-9.314e-02, 1.395e-02, -1.741e-02, -7.208e-02, -5.164e-02, -5.743e-02, 5.702e-02, 1.342e-01, 6.011e-03, 1.626e-01, 1.101e-01, -1.130e-01, 6.127e-02, -8.956e-03, -7.149e-02, -6.488e-03));
	r += mul(s2_1, M4(-2.534e-01, 1.086e-01, -1.007e-01, -3.067e-02, -1.074e-01, 7.219e-03, 6.768e-02, -1.012e-01, 2.019e-01, 4.263e-03, -7.411e-02, -1.173e-01, 1.961e-01, -5.619e-02, -2.390e-01, -1.323e-01));
	r += mul(s2_2, M4(-1.039e-01, -9.899e-02, -2.206e-01, -2.187e-01, -8.739e-03, 6.607e-02, 4.125e-02, 5.363e-02, -6.572e-03, 3.014e-02, 1.314e-01, -9.560e-02, 2.106e-01, 1.237e-02, -8.354e-02, -4.939e-03));
	r += mul(s2_3, M4(-4.682e-02, -1.357e-01, 3.481e-02, -2.187e-01, 1.113e-01, 8.812e-02, -1.211e-01, -2.011e-02, 1.567e-01, -2.216e-02, -4.920e-03, -2.458e-01, 2.263e-02, 6.741e-02, -1.234e-02, 2.338e-02));
	r += mul(s2_4, M4(5.105e-02, -3.845e-01, 1.812e-01, -1.927e-01, 2.840e-01, -2.094e-01, 5.673e-02, 4.405e-02, 5.957e-01, 1.734e-02, -1.158e-01, -6.956e-01, -2.077e-01, 5.130e-03, 4.744e-01, -1.540e-02));
	r += mul(s2_5, M4(1.601e-01, -2.680e-01, -1.678e-01, -1.207e-01, -4.648e-02, -6.454e-02, 1.122e-01, -6.567e-02, 1.638e-01, -1.259e-01, -2.470e-02, -3.547e-01, -1.333e-01, -1.219e-02, -7.710e-02, -3.881e-01));
	r += mul(s2_6, M4(-6.060e-02, 1.662e-01, -2.082e-01, 3.193e-01, -1.317e-01, 1.395e-04, 2.436e-01, -1.480e-01, 6.104e-03, -2.009e-01, -6.729e-02, -2.207e-01, -7.784e-02, -7.589e-02, 7.569e-02, 3.261e-03));
	r += mul(s2_7, M4(-2.951e-01, -2.050e-01, 2.827e-02, 3.739e-01, 1.947e-01, 5.411e-01, -2.262e-01, -8.808e-03, 2.262e-01, -9.010e-02, -1.476e-01, -3.582e-01, -1.718e-01, 2.844e-02, 7.832e-02, 1.414e-03));
	r += mul(s2_8, M4(3.534e-01, 1.695e-01, -1.247e-01, 4.750e-01, 4.171e-02, 2.338e-02, -4.525e-02, -4.955e-02, 2.934e-01, -3.865e-02, -1.125e-01, -2.127e-01, 1.326e-01, 5.967e-02, 6.215e-02, 1.048e-01));
	r += mul(s3_0, M4(4.186e-02, -5.378e-02, 7.641e-02, -3.524e-02, -2.447e-01, -5.374e-02, -1.380e-01, -4.221e-01, -3.797e-02, -7.623e-03, -4.826e-02, 1.791e-02, -1.390e-01, 1.115e-01, 2.252e-01, -9.103e-03));
	r += mul(s3_1, M4(1.339e-01, 3.093e-01, -3.615e-02, 8.684e-02, -4.098e-01, -1.216e-01, 2.372e-01, -1.247e-01, -5.358e-02, -1.660e-01, -8.435e-02, 3.871e-02, 2.722e-01, -1.145e-01, -3.944e-01, -5.003e-02));
	r += mul(s3_2, M4(-4.430e-02, -3.135e-02, 1.019e-01, -1.129e-01, -2.647e-01, -1.317e-01, 8.715e-02, -5.466e-02, -3.946e-02, 7.216e-02, 1.677e-01, 9.349e-02, 8.069e-02, -1.097e-01, -9.659e-03, -8.460e-02));
	r += mul(s3_3, M4(-5.036e-03, 4.992e-02, 1.086e-01, -1.339e-02, 2.792e-01, 3.294e-01, -1.578e-01, 4.592e-01, -7.749e-02, 4.384e-02, -4.212e-02, 2.287e-02, 1.456e-01, 4.774e-02, -1.264e-01, 7.437e-02));
	r += mul(s3_4, M4(3.022e-01, -2.197e-01, -4.347e-02, -2.198e-01, 3.922e-02, 8.609e-02, 8.862e-02, 3.418e-01, 8.117e-02, -2.026e-02, -3.236e-01, -2.539e-01, -6.030e-02, -2.409e-01, 7.879e-02, -8.457e-02));
	r += mul(s3_5, M4(3.525e-01, 2.622e-01, -4.994e-02, -1.932e-01, -1.508e-01, 1.229e-01, 1.359e-01, 1.613e-01, 1.830e-01, -4.473e-02, -5.438e-02, -1.041e-01, 4.534e-01, -4.660e-01, -7.405e-02, -1.001e-01));
	r += mul(s3_6, M4(-1.224e-02, -5.840e-03, 8.031e-02, -2.279e-02, -2.128e-01, 1.477e-01, -9.937e-03, 4.142e-02, -3.726e-02, -1.013e-01, -2.940e-03, -1.333e-01, 1.353e-01, -2.192e-01, -3.858e-01, -1.100e-01));
	r += mul(s3_7, M4(-8.882e-02, 1.341e-01, 2.707e-01, 2.212e-01, 2.628e-01, 3.454e-01, -3.703e-01, 4.902e-01, 1.527e-01, 8.567e-03, -1.742e-01, -1.884e-01, -7.710e-01, 1.028e-01, 3.233e-01, -3.897e-01));
	r += mul(s3_8, M4(3.715e-02, 2.936e-01, -1.195e-01, -1.295e-01, -1.313e-01, -1.222e-01, -2.876e-01, 5.694e-02, 6.813e-02, -1.738e-02, -1.154e-01, 1.649e-02, 1.755e-01, -1.639e-01, 3.212e-02, 3.504e-01));
	r += V4(-8.611e-03, -6.529e-03, -1.098e-03, 4.669e-03);
	return r;
}

// Pass 3 entry point. Each thread handles one texel `gxy`: it gathers the 3x3
// neighbourhood from both of this pass's inputs (via the l0/l1 macros), splits
// every tap into a non-negative and a non-positive half, and stores the results
// of f0 and f1 into t0 and t1 respectively.
//
// blockStart: origin of the block this thread belongs to.
// tid:        thread id; only tid.x is used, remapped through Rmp8x8.
void Pass3(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` must keep these exact names; the O() sampling macro
	// refers to them at every l0/l1 expansion site.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 dim = GetInputSize();
	// Threads that fall outside the input have nothing to write.
	if (any(gxy >= dim)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps from the first input, offsets (-1,-1)..(1,1), x varying fastest.
	V4 a0 = l0(-1.0, -1.0);
	V4 a1 = l0(0.0, -1.0);
	V4 a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0);
	V4 a4 = l0(0.0, 0.0);
	V4 a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0);
	V4 a7 = l0(0.0, 1.0);
	V4 a8 = l0(1.0, 1.0);

	// Non-negative half of each tap: max(a, 0).
	V4 p0 = max(a0, 0.0);
	V4 p1 = max(a1, 0.0);
	V4 p2 = max(a2, 0.0);
	V4 p3 = max(a3, 0.0);
	V4 p4 = max(a4, 0.0);
	V4 p5 = max(a5, 0.0);
	V4 p6 = max(a6, 0.0);
	V4 p7 = max(a7, 0.0);
	V4 p8 = max(a8, 0.0);

	// Non-positive half of each tap: -max(-a, 0).
	V4 n0 = -max(-a0, 0.0);
	V4 n1 = -max(-a1, 0.0);
	V4 n2 = -max(-a2, 0.0);
	V4 n3 = -max(-a3, 0.0);
	V4 n4 = -max(-a4, 0.0);
	V4 n5 = -max(-a5, 0.0);
	V4 n6 = -max(-a6, 0.0);
	V4 n7 = -max(-a7, 0.0);
	V4 n8 = -max(-a8, 0.0);

	// Raw taps from the second input, same offset order.
	V4 b0 = l1(-1.0, -1.0);
	V4 b1 = l1(0.0, -1.0);
	V4 b2 = l1(1.0, -1.0);
	V4 b3 = l1(-1.0, 0.0);
	V4 b4 = l1(0.0, 0.0);
	V4 b5 = l1(1.0, 0.0);
	V4 b6 = l1(-1.0, 1.0);
	V4 b7 = l1(0.0, 1.0);
	V4 b8 = l1(1.0, 1.0);

	// Non-negative half of the second input's taps.
	V4 q0 = max(b0, 0.0);
	V4 q1 = max(b1, 0.0);
	V4 q2 = max(b2, 0.0);
	V4 q3 = max(b3, 0.0);
	V4 q4 = max(b4, 0.0);
	V4 q5 = max(b5, 0.0);
	V4 q6 = max(b6, 0.0);
	V4 q7 = max(b7, 0.0);
	V4 q8 = max(b8, 0.0);

	// Non-positive half of the second input's taps.
	V4 m0 = -max(-b0, 0.0);
	V4 m1 = -max(-b1, 0.0);
	V4 m2 = -max(-b2, 0.0);
	V4 m3 = -max(-b3, 0.0);
	V4 m4 = -max(-b4, 0.0);
	V4 m5 = -max(-b5, 0.0);
	V4 m6 = -max(-b6, 0.0);
	V4 m7 = -max(-b7, 0.0);
	V4 m8 = -max(-b8, 0.0);

	// Argument order expected by f0/f1: first input (+), first input (-),
	// second input (+), second input (-), nine taps each.
	t0[gxy] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8, q0, q1, q2, q3, q4, q5, q6, q7, q8, m0, m1, m2, m3, m4, m5, m6, m7, m8);
	t1[gxy] = f1(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8, q0, q1, q2, q3, q4, q5, q6, q7, q8, m0, m1, m2, m3, m4, m5, m6, m7, m8);
}

//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0, t1
//!OUT t2, t3

// Tap helpers for this pass: l0 reads t0 and l1 reads t1, each at an offset of
// (x, y) * pt from `pos`, using the point sampler via the O() macro. Both
// macros therefore require `pos` and `pt` to be in scope where they are
// expanded. The sampled value is converted to V4 (min16float4).
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))

// Computes the first four-channel output of this pass (Pass4 stores it in t2).
//
// Takes 36 four-component inputs: four groups (s0_*, s1_*, s2_*, s3_*) of nine
// taps each. As called from Pass4, s0_* / s1_* are the max(v, 0) and
// -max(-v, 0) halves of the nine l0 taps, and s2_* / s3_* are the same halves
// of the nine l1 taps.
//
// Each input is multiplied as a row vector by its own constant 4x4 matrix
// (mul(vector, matrix)) and the products are summed into `r` in the order
// written; a constant four-component offset is added last. The arithmetic is
// done in min16float, so the summation order should be left as is.
//
// Returns the accumulated V4.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 r = 0.0;
	// Group s0_*: nine taps, one weight matrix per tap.
	r += mul(s0_0, M4(7.356e-02, 8.402e-03, 1.287e-01, 6.762e-02, 2.134e-01, -6.620e-02, -2.788e-01, -5.744e-02, -3.896e-02, -3.993e-02, -7.161e-02, -1.982e-01, -6.734e-02, 8.804e-03, -4.739e-02, 6.502e-02));
	r += mul(s0_1, M4(2.249e-01, 4.958e-02, 1.138e-01, 3.152e-01, 2.008e-01, 1.703e-01, 5.817e-02, -9.482e-02, -2.371e-01, 3.975e-02, -1.755e-01, -2.666e-01, 2.819e-01, -2.640e-02, 1.405e-01, -6.009e-02));
	r += mul(s0_2, M4(2.065e-01, -3.027e-02, -3.447e-02, 3.226e-03, -1.252e-02, -7.589e-03, 2.344e-03, -1.704e-02, -8.894e-02, 3.136e-02, -1.517e-01, -2.176e-02, 8.920e-02, -5.322e-02, -9.529e-02, 8.355e-02));
	r += mul(s0_3, M4(1.136e-01, 1.015e-01, -2.730e-02, -2.144e-01, -9.526e-02, -2.857e-01, 2.711e-01, -1.991e-01, 2.596e-01, 1.602e-01, -2.169e-01, -1.097e-01, 3.353e-02, 6.231e-02, 8.753e-03, 3.707e-01));
	r += mul(s0_4, M4(-1.945e-01, 3.081e-01, -2.270e-01, -5.963e-02, -1.666e-01, -3.408e-01, 1.161e-01, -6.384e-02, -6.823e-01, -4.014e-01, -6.276e-01, -1.672e-01, 2.986e-03, -1.351e-01, 1.668e-01, -3.133e-01));
	r += mul(s0_5, M4(4.802e-02, -4.275e-02, 1.978e-03, -7.602e-02, -4.082e-03, 5.572e-02, -3.341e-02, 9.101e-03, -1.038e-01, 1.622e-01, 2.334e-02, 1.768e-01, 9.416e-03, -2.287e-01, 1.048e-01, -2.926e-01));
	r += mul(s0_6, M4(5.333e-04, 3.089e-02, 2.721e-02, -3.601e-02, -5.081e-02, -1.152e-01, 6.752e-02, 1.701e-01, -2.951e-02, 2.450e-01, -1.684e-01, -4.702e-02, -1.580e-02, 1.200e-02, -1.266e-02, 4.937e-02));
	r += mul(s0_7, M4(-1.351e-02, -6.248e-02, -3.060e-03, 4.140e-02, -2.090e-01, -6.831e-01, -8.857e-02, 2.536e-01, -2.333e-02, 1.521e-01, -8.033e-02, 2.124e-01, -6.615e-02, 1.317e-01, 1.847e-01, -2.150e-01));
	r += mul(s0_8, M4(4.605e-02, 1.013e-01, 6.834e-03, -6.411e-02, -1.476e-02, -2.845e-01, -4.312e-02, -1.171e-02, 6.985e-02, -6.859e-02, -2.785e-02, -3.226e-02, 5.186e-02, 1.102e-01, -2.071e-02, -1.250e-01));
	// Group s1_*.
	r += mul(s1_0, M4(1.952e-01, -3.342e-02, -3.770e-02, -2.026e-01, 4.850e-02, -3.174e-02, -1.987e-01, -2.886e-02, -1.298e-01, 1.994e-02, 1.131e-01, 2.950e-02, -1.791e-02, -4.533e-02, 4.695e-02, -6.907e-02));
	r += mul(s1_1, M4(2.401e-01, 1.809e-01, -5.151e-02, -6.271e-02, -1.409e-01, 9.215e-03, 1.176e-01, 2.717e-02, 1.130e-01, -3.228e-02, -9.086e-02, -1.202e-03, 1.642e-03, -7.943e-03, 1.097e-01, 1.842e-01));
	r += mul(s1_2, M4(8.774e-02, -1.486e-02, -4.808e-02, 4.089e-02, 6.244e-02, -7.645e-02, 5.614e-02, -5.706e-02, -2.386e-02, 4.407e-02, -1.378e-01, -5.880e-02, 2.936e-02, 2.285e-02, -3.924e-02, 5.724e-02));
	r += mul(s1_3, M4(2.603e-01, -1.455e-01, 1.429e-01, -2.992e-02, -6.288e-02, -5.216e-02, -1.802e-01, 1.060e-01, -2.473e-02, -6.795e-03, 2.843e-02, 7.745e-02, -4.868e-03, -9.998e-02, -7.961e-02, 5.068e-02));
	r += mul(s1_4, M4(2.018e-01, -1.293e-01, -5.291e-02, -4.763e-02, 3.484e-02, -1.648e-01, 8.786e-02, -6.101e-02, -1.083e-01, 5.522e-02, -1.814e-01, -2.392e-01, 6.427e-02, -1.908e-02, 2.643e-01, 1.294e-01));
	r += mul(s1_5, M4(-7.897e-02, -5.967e-02, -2.620e-01, 1.274e-02, -2.583e-02, 5.654e-02, -7.639e-02, -7.534e-03, -5.812e-02, -7.887e-02, -3.738e-03, 7.664e-02, 1.753e-02, -2.842e-01, -3.237e-01, 2.077e-02));
	r += mul(s1_6, M4(6.558e-02, -9.890e-02, 1.849e-02, 3.242e-04, 1.021e-02, 1.234e-01, 1.224e-02, -4.322e-02, -2.778e-02, 3.860e-02, -5.257e-02, -1.466e-02, -1.001e-02, -1.291e-03, 1.724e-01, -9.167e-02));
	r += mul(s1_7, M4(-5.291e-02, -2.764e-01, -6.402e-02, 4.327e-02, 1.921e-02, -1.484e-01, 3.286e-02, 4.051e-02, 1.636e-02, 3.932e-01, -5.432e-02, 4.540e-02, 3.947e-02, -1.385e-01, -1.065e-01, 1.569e-01));
	r += mul(s1_8, M4(-1.729e-02, 8.177e-02, -4.479e-02, -1.275e-01, -3.302e-03, -1.265e-01, -2.922e-02, 3.720e-02, 1.560e-02, 5.266e-02, -1.572e-02, -4.840e-02, 3.991e-03, 1.003e-01, -1.423e-01, 7.414e-02));
	// Group s2_*.
	r += mul(s2_0, M4(-1.207e-02, -2.418e-02, -7.769e-03, -1.401e-01, 1.660e-01, -6.347e-03, -1.092e-02, -1.830e-02, -1.252e-01, -5.217e-02, 9.898e-03, 1.461e-02, 2.654e-02, 1.219e-02, -3.769e-02, 1.897e-02));
	r += mul(s2_1, M4(-3.650e-02, 1.317e-01, 1.299e-02, -5.512e-02, -1.287e-01, 2.438e-02, -1.609e-03, 1.759e-01, 1.824e-02, 6.477e-03, 2.905e-02, -8.644e-02, 7.496e-02, -9.920e-02, 1.147e-02, 1.889e-01));
	r += mul(s2_2, M4(-7.005e-03, -4.482e-02, -1.853e-02, 3.441e-02, 1.251e-01, -3.162e-02, -1.701e-01, -5.231e-02, -1.647e-01, 2.261e-02, 8.255e-02, -3.730e-02, 1.811e-01, -9.052e-02, 1.728e-02, 1.911e-02));
	r += mul(s2_3, M4(2.359e-02, -1.334e-01, 2.761e-02, -1.251e-01, 1.455e-01, 4.076e-02, -3.260e-02, -1.782e-01, -3.575e-02, 1.411e-02, 1.322e-01, -9.592e-02, 5.423e-02, 7.989e-03, -1.460e-01, 8.895e-02));
	r += mul(s2_4, M4(1.304e-01, 1.296e-01, -7.250e-02, -6.647e-02, 8.382e-02, 1.111e-01, 8.976e-02, -5.914e-02, -2.228e-01, -4.772e-02, -1.931e-03, 8.499e-02, 4.483e-01, 1.327e-01, 5.086e-02, -4.795e-01));
	r += mul(s2_5, M4(4.674e-02, 7.104e-02, -5.312e-02, -7.730e-02, 2.647e-03, 8.893e-03, -8.889e-02, -5.714e-02, -4.546e-02, -4.002e-02, -1.514e-01, -2.989e-02, -8.669e-02, -5.441e-03, 1.460e-02, -2.327e-02));
	r += mul(s2_6, M4(1.146e-01, -1.154e-01, 8.289e-03, 7.655e-02, -2.194e-02, -3.908e-02, -2.191e-02, 2.363e-03, 4.527e-02, -7.852e-02, -4.728e-02, 1.066e-01, 4.023e-02, -5.192e-02, -4.180e-02, -3.879e-02));
	r += mul(s2_7, M4(2.446e-01, -2.295e-01, -5.819e-02, -2.646e-02, 8.106e-02, -8.799e-02, -3.455e-02, 6.900e-02, 5.579e-02, -1.551e-01, 1.609e-01, 9.954e-02, -1.499e-01, 8.628e-02, 1.114e-01, 1.313e-02));
	r += mul(s2_8, M4(1.028e-02, 9.150e-02, -6.161e-02, 5.124e-03, 3.822e-02, 1.533e-02, 2.329e-02, -1.106e-01, -1.541e-03, -1.818e-01, -9.577e-02, -3.402e-02, 1.784e-02, -1.152e-01, 6.896e-02, -1.111e-01));
	// Group s3_*.
	r += mul(s3_0, M4(-7.349e-02, -4.782e-02, 3.080e-02, -1.668e-01, 9.572e-02, 5.307e-02, 5.573e-03, 6.483e-02, 1.104e-01, -5.707e-02, -8.579e-02, -1.754e-02, 1.038e-01, 1.706e-02, -1.185e-01, 5.863e-02));
	r += mul(s3_1, M4(-1.639e-01, -6.808e-03, 1.836e-02, -1.482e-01, 1.032e-01, 2.612e-02, -1.751e-01, -1.527e-01, 3.169e-03, 5.272e-02, 7.983e-02, 5.066e-02, 1.191e-01, 3.658e-02, 3.275e-02, -1.122e-01));
	r += mul(s3_2, M4(-8.279e-02, -1.068e-02, 3.848e-02, -8.857e-03, -3.783e-02, 9.934e-02, -7.181e-02, 2.801e-02, -1.524e-01, -7.166e-02, 1.038e-01, -9.840e-04, -7.254e-03, -3.252e-02, -1.435e-02, 6.052e-03));
	r += mul(s3_3, M4(-3.534e-02, -2.891e-02, 3.778e-01, -2.472e-01, -4.015e-02, -5.651e-02, 2.006e-01, 1.249e-02, -8.408e-02, -1.160e-02, 2.881e-01, -6.805e-03, 1.340e-02, -1.237e-01, -1.617e-01, 1.894e-02));
	r += mul(s3_4, M4(-1.512e-02, 3.232e-01, -1.441e-01, -3.778e-01, -1.475e-01, -2.644e-03, -3.149e-01, 3.225e-02, 1.227e-01, -3.620e-02, -1.175e-01, -3.857e-01, 4.834e-02, -1.567e-01, 1.632e-01, -1.292e-01));
	r += mul(s3_5, M4(-1.592e-01, 3.426e-02, -1.506e-01, 1.215e-01, 1.314e-01, -7.432e-02, -8.767e-02, 1.685e-01, 6.875e-02, 2.804e-01, -3.279e-02, -1.870e-01, 1.049e-01, -9.061e-02, 8.573e-02, -9.407e-02));
	r += mul(s3_6, M4(5.310e-02, -1.089e-01, -1.496e-01, 2.134e-01, 5.599e-02, -1.565e-01, -6.842e-02, -1.362e-02, 6.861e-02, -2.548e-02, -1.614e-01, -3.698e-02, -2.731e-02, 1.138e-02, 1.288e-02, -1.789e-02));
	r += mul(s3_7, M4(-7.967e-02, -2.461e-01, -2.139e-01, 3.193e-01, 1.377e-01, -1.213e-01, 8.415e-02, 1.224e-02, 1.192e-01, 1.785e-01, 1.978e-01, 1.008e-01, 3.016e-02, 9.868e-02, 3.118e-03, -3.294e-02));
	r += mul(s3_8, M4(1.121e-01, -4.625e-02, 3.331e-02, -7.687e-02, 5.520e-02, 6.326e-02, 1.369e-02, 1.850e-02, 4.062e-02, -1.561e-01, -8.640e-02, 1.105e-01, 8.446e-03, -1.746e-03, 4.572e-02, -1.015e-01));
	// Constant per-channel offset added after all weighted taps.
	r += V4(-1.057e-02, -1.114e-02, 1.597e-04, 1.132e-02);
	return r;
}

// Computes the second four-channel output of this pass (Pass4 stores it in t3).
//
// Same parameter list as f0 above: four groups (s0_*, s1_*, s2_*, s3_*) of
// nine four-component taps. Each tap is multiplied as a row vector by its own
// constant 4x4 matrix and the products are summed into `r` in the order
// written, followed by a constant four-component offset. Only the constants
// differ from f0. The arithmetic is done in min16float, so the summation order
// should be left as is.
//
// Returns the accumulated V4.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 r = 0.0;
	// Group s0_*: nine taps, one weight matrix per tap.
	r += mul(s0_0, M4(2.399e-01, 1.190e-01, 9.941e-02, -5.908e-03, 2.176e-01, -3.861e-02, -4.997e-02, -3.036e-02, -6.079e-02, 2.294e-02, -1.260e-01, 6.001e-02, -7.690e-02, -4.805e-02, 6.117e-03, 4.358e-02));
	r += mul(s0_1, M4(-4.669e-02, -1.150e-01, 9.700e-03, 2.351e-02, 3.215e-01, -1.737e-03, 2.091e-01, -1.245e-01, -8.592e-02, 1.866e-01, 2.826e-01, -6.728e-01, 1.528e-01, 5.511e-02, -4.930e-02, -1.959e-02));
	r += mul(s0_2, M4(-2.182e-02, -4.512e-02, 6.864e-02, 8.299e-02, 8.483e-03, -4.855e-02, -1.500e-01, 1.325e-02, 6.098e-02, -1.867e-02, 1.276e-02, 1.721e-02, 5.918e-03, -1.130e-01, 7.066e-04, -1.824e-03));
	r += mul(s0_3, M4(4.594e-02, 1.518e-01, -2.067e-01, 1.546e-02, -1.548e-02, 1.126e-01, -4.502e-03, -2.014e-02, 2.417e-01, -1.530e-01, -1.095e-01, -4.966e-02, 2.291e-01, -4.598e-03, 2.836e-01, 5.562e-02));
	r += mul(s0_4, M4(5.432e-02, -3.003e-01, 7.389e-01, -1.497e-01, -2.439e-01, -3.298e-01, 4.081e-01, -2.105e-01, -4.267e-01, 3.913e-01, 5.470e-01, 5.594e-01, -1.221e-01, -5.444e-02, -4.180e-01, 1.515e-01));
	r += mul(s0_5, M4(2.205e-01, -5.813e-03, 7.451e-03, 8.130e-02, -3.312e-02, -9.387e-02, -9.824e-02, 4.493e-02, 8.187e-02, -2.042e-01, 1.644e-01, 1.562e-01, -8.427e-02, 2.057e-01, -1.668e-02, -2.356e-01));
	r += mul(s0_6, M4(1.150e-02, 1.442e-02, -1.973e-02, -4.599e-02, -9.680e-02, 3.962e-02, 1.731e-02, -2.402e-02, 3.936e-02, 6.512e-03, 2.103e-02, 2.025e-03, -1.308e-02, -5.259e-02, 5.631e-02, 3.037e-02));
	r += mul(s0_7, M4(-1.306e-02, -3.164e-02, 1.196e-01, 2.798e-02, -2.533e-01, -1.204e-01, 1.860e-01, 1.564e-01, -4.731e-02, -7.323e-02, 1.441e-03, -9.049e-02, -3.371e-02, -2.801e-04, 2.952e-02, -2.632e-02));
	r += mul(s0_8, M4(3.024e-02, -1.034e-02, -7.595e-02, -7.550e-02, 3.562e-02, -4.589e-02, -3.066e-02, 7.995e-02, -1.866e-02, 1.022e-01, -2.624e-02, -1.074e-01, 2.176e-02, 1.434e-01, -5.664e-02, -3.473e-02));
	// Group s1_*.
	r += mul(s1_0, M4(2.252e-01, 9.801e-02, -5.786e-02, -6.661e-02, 7.599e-02, -9.244e-02, 4.437e-02, -1.203e-01, -1.577e-01, -3.797e-02, -1.335e-02, 4.540e-02, -3.540e-03, -9.094e-03, -4.076e-02, -8.099e-02));
	r += mul(s1_1, M4(2.557e-01, -2.549e-01, 2.306e-01, -4.389e-02, -3.677e-02, 5.796e-02, 4.505e-02, -1.209e-01, -4.484e-02, 1.229e-01, -5.686e-02, 2.778e-02, 9.876e-02, -6.893e-04, 9.771e-02, 1.264e-01));
	r += mul(s1_2, M4(-5.324e-02, -9.632e-02, -1.092e-02, -1.426e-02, 3.082e-02, 9.196e-02, -1.381e-01, -1.013e-01, 7.758e-03, -3.290e-02, 1.630e-02, -4.979e-03, -7.297e-02, -7.534e-02, 2.040e-02, -1.983e-01));
	r += mul(s1_3, M4(1.951e-01, 3.566e-02, 4.220e-02, 8.086e-02, -5.114e-02, -5.626e-02, -6.912e-02, 1.462e-01, 2.268e-03, -2.592e-02, 3.527e-02, -3.832e-02, 4.756e-02, 1.234e-01, -5.494e-03, 4.695e-02));
	r += mul(s1_4, M4(4.147e-01, -2.431e-01, 2.372e-01, 2.574e-04, -4.485e-02, 5.014e-02, 3.928e-02, -2.817e-02, 3.512e-01, 2.983e-01, -1.260e-01, 4.326e-01, -2.366e-01, -6.912e-02, 2.259e-01, -4.534e-01));
	r += mul(s1_5, M4(1.323e-01, 5.260e-03, 2.693e-02, 1.841e-01, -1.105e-01, 6.002e-02, -1.233e-01, 1.012e-02, -9.410e-02, -1.260e-01, 1.264e-02, -3.910e-02, 3.656e-01, -1.103e-01, 5.059e-01, 4.280e-01));
	r += mul(s1_6, M4(-7.537e-02, -2.153e-02, -4.511e-02, -5.184e-02, -1.745e-02, -1.165e-02, 1.352e-02, -1.951e-02, -4.888e-02, 2.249e-02, -3.915e-02, -4.557e-03, -9.946e-03, -1.633e-04, -3.200e-02, -1.356e-02));
	r += mul(s1_7, M4(-1.509e-01, -2.227e-02, 1.640e-01, 2.693e-02, 4.846e-02, 3.303e-02, -5.390e-02, 3.607e-02, -2.818e-02, -7.170e-02, 3.311e-02, -9.203e-02, -1.946e-03, -8.577e-02, -2.925e-02, 1.238e-01));
	r += mul(s1_8, M4(-3.295e-02, 1.995e-02, -1.689e-01, -4.353e-02, -4.138e-02, -7.439e-03, -2.343e-02, 6.997e-02, 8.031e-02, 1.117e-01, 4.894e-02, -6.214e-02, -1.960e-01, -1.630e-01, 8.586e-02, -8.213e-02));
	// Group s2_*.
	r += mul(s2_0, M4(-9.883e-02, -1.168e-02, -1.110e-01, -2.148e-01, 1.452e-01, 3.417e-03, -4.513e-02, 8.845e-02, -7.791e-02, 2.326e-02, -4.188e-02, -3.659e-02, 3.105e-02, -1.318e-02, -4.552e-03, 7.109e-02));
	r += mul(s2_1, M4(1.958e-02, -6.995e-02, 2.588e-01, -6.431e-02, -2.211e-01, 5.281e-02, 5.399e-02, 8.884e-02, -5.135e-02, -4.768e-02, 1.363e-01, -2.064e-01, -1.391e-01, 1.106e-01, -2.611e-01, 2.038e-01));
	r += mul(s2_2, M4(-6.883e-02, -1.360e-03, -1.628e-01, 7.301e-02, 1.213e-01, -5.159e-03, 1.194e-01, -1.148e-02, -1.285e-01, -1.448e-01, 1.776e-02, -1.414e-01, -3.022e-02, 1.382e-01, 6.695e-02, -4.201e-02));
	r += mul(s2_3, M4(-1.194e-01, 1.524e-03, -1.945e-01, -1.496e-01, 1.413e-03, -8.697e-04, -1.542e-01, -1.798e-03, -4.991e-02, -7.944e-03, -1.094e-01, -5.578e-02, 1.526e-01, -6.170e-02, 1.598e-01, 1.306e-01));
	r += mul(s2_4, M4(3.583e-02, -1.213e-01, 2.087e-01, -4.616e-02, 2.125e-01, -1.242e-01, 2.776e-01, -8.100e-02, -1.733e-01, 1.016e-01, 2.949e-01, 1.489e-01, 5.059e-01, 3.526e-01, -4.764e-01, -1.105e-02));
	r += mul(s2_5, M4(7.240e-02, 1.034e-01, -1.103e-01, 2.351e-02, -2.711e-02, 1.506e-02, -1.534e-01, 1.093e-01, 5.065e-02, -2.686e-01, 1.423e-01, -4.993e-02, 7.167e-02, 1.084e-01, -8.139e-03, 4.460e-02));
	r += mul(s2_6, M4(1.243e-01, 1.281e-02, 7.048e-02, 1.117e-01, -1.145e-01, -1.703e-02, -1.470e-02, -3.647e-02, 3.796e-03, 2.441e-02, -8.422e-02, 1.955e-02, -2.861e-02, -6.963e-02, 6.894e-02, -4.071e-02));
	r += mul(s2_7, M4(2.315e-01, 7.446e-02, -7.632e-02, 1.319e-01, -2.392e-02, 2.525e-02, 4.687e-02, 7.645e-02, 4.250e-02, -4.733e-02, 2.179e-01, -3.843e-02, -3.526e-01, 9.675e-02, -1.837e-01, -1.563e-01));
	r += mul(s2_8, M4(5.933e-02, 1.490e-01, -5.844e-02, 9.363e-02, 7.616e-04, -1.075e-02, -1.365e-01, -6.094e-02, 7.094e-03, -1.218e-01, 7.021e-02, 3.101e-02, -4.184e-02, 3.989e-02, -7.167e-02, -1.179e-01));
	// Group s3_*.
	r += mul(s3_0, M4(-7.835e-02, 6.392e-02, -5.802e-02, -1.483e-01, 1.374e-01, 3.699e-02, 2.043e-03, 1.554e-01, -6.873e-02, -1.174e-02, -1.518e-01, -1.405e-02, 4.783e-03, -1.131e-01, 4.121e-02, -8.849e-02));
	r += mul(s3_1, M4(-1.463e-01, 5.240e-02, -1.651e-02, -2.410e-01, 1.092e-01, -3.146e-02, -1.629e-02, -2.974e-02, -7.838e-02, -7.374e-03, 2.745e-01, -1.408e-01, 1.335e-01, 8.634e-02, 1.073e-02, -1.407e-02));
	r += mul(s3_2, M4(-7.340e-02, 2.321e-02, 1.922e-02, -1.112e-01, 2.932e-02, -2.587e-02, 1.333e-01, 4.721e-02, -1.514e-01, -3.395e-02, -1.264e-01, 1.777e-02, -8.692e-02, 1.186e-02, -7.424e-02, -2.402e-02));
	r += mul(s3_3, M4(5.052e-02, 2.790e-03, 3.121e-02, -1.839e-01, 3.910e-02, 2.279e-02, 6.041e-02, -8.205e-03, -5.819e-02, 5.701e-04, 5.763e-02, -1.835e-02, -7.273e-02, -1.017e-01, -4.708e-02, 3.331e-02));
	r += mul(s3_4, M4(-4.521e-02, -3.700e-02, -1.199e-01, -3.863e-01, -4.641e-01, -2.451e-01, 1.512e-03, -3.424e-01, -1.194e-01, 1.119e-01, -1.183e-01, 1.918e-01, 8.865e-02, 1.866e-01, -4.503e-02, 2.355e-03));
	r += mul(s3_5, M4(5.461e-02, -1.461e-01, 2.827e-01, 2.041e-01, -8.786e-03, 1.079e-02, 1.593e-01, 2.173e-01, 4.916e-01, -1.773e-01, 2.149e-02, -1.461e-01, -5.435e-02, 1.909e-01, -2.171e-01, -7.547e-02));
	r += mul(s3_6, M4(2.543e-02, 5.455e-02, -6.107e-02, 5.194e-03, 9.984e-02, 8.664e-02, -7.757e-04, 3.957e-02, 1.432e-01, 3.805e-02, -1.005e-03, 7.600e-02, -4.304e-02, -6.326e-02, 3.996e-02, 3.872e-03));
	r += mul(s3_7, M4(-1.234e-01, -1.276e-01, 1.312e-01, 8.454e-02, 1.539e-01, 6.822e-02, -1.455e-02, 1.223e-01, -1.060e-01, -3.708e-02, -1.480e-01, -7.922e-02, 6.503e-02, 1.105e-01, -1.249e-01, -3.210e-02));
	r += mul(s3_8, M4(1.676e-01, -7.072e-03, 4.581e-02, -1.006e-01, -1.056e-02, -8.209e-02, -4.804e-02, 2.427e-02, -1.165e-01, -8.224e-02, 2.940e-01, -9.220e-03, 5.420e-02, 1.802e-01, -1.190e-01, 1.433e-02));
	// Constant per-channel offset added after all weighted taps.
	r += V4(-4.712e-03, -1.187e-02, 1.287e-02, -6.625e-03);
	return r;
}

// Pass 4 entry point. Each thread handles one texel `gxy`: it gathers the 3x3
// neighbourhood from t0 (via l0) and t1 (via l1), splits every tap into a
// non-negative and a non-positive half, and stores the results of f0 and f1
// into t2 and t3 respectively.
//
// blockStart: origin of the block this thread belongs to.
// tid:        thread id; only tid.x is used, remapped through Rmp8x8.
void Pass4(uint2 blockStart, uint3 tid) {
	// NOTE: `pt` and `pos` must keep these exact names; the O() sampling macro
	// refers to them at every l0/l1 expansion site.
	float2 pt = float2(GetInputPt());
	uint2 gxy = blockStart + Rmp8x8(tid.x);
	uint2 dim = GetInputSize();
	// Threads that fall outside the input have nothing to write.
	if (any(gxy >= dim)) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	// Raw taps from t0, offsets (-1,-1)..(1,1), x varying fastest.
	V4 a0 = l0(-1.0, -1.0);
	V4 a1 = l0(0.0, -1.0);
	V4 a2 = l0(1.0, -1.0);
	V4 a3 = l0(-1.0, 0.0);
	V4 a4 = l0(0.0, 0.0);
	V4 a5 = l0(1.0, 0.0);
	V4 a6 = l0(-1.0, 1.0);
	V4 a7 = l0(0.0, 1.0);
	V4 a8 = l0(1.0, 1.0);

	// Non-negative half of each t0 tap: max(a, 0).
	V4 p0 = max(a0, 0.0);
	V4 p1 = max(a1, 0.0);
	V4 p2 = max(a2, 0.0);
	V4 p3 = max(a3, 0.0);
	V4 p4 = max(a4, 0.0);
	V4 p5 = max(a5, 0.0);
	V4 p6 = max(a6, 0.0);
	V4 p7 = max(a7, 0.0);
	V4 p8 = max(a8, 0.0);

	// Non-positive half of each t0 tap: -max(-a, 0).
	V4 n0 = -max(-a0, 0.0);
	V4 n1 = -max(-a1, 0.0);
	V4 n2 = -max(-a2, 0.0);
	V4 n3 = -max(-a3, 0.0);
	V4 n4 = -max(-a4, 0.0);
	V4 n5 = -max(-a5, 0.0);
	V4 n6 = -max(-a6, 0.0);
	V4 n7 = -max(-a7, 0.0);
	V4 n8 = -max(-a8, 0.0);

	// Raw taps from t1, same offset order.
	V4 b0 = l1(-1.0, -1.0);
	V4 b1 = l1(0.0, -1.0);
	V4 b2 = l1(1.0, -1.0);
	V4 b3 = l1(-1.0, 0.0);
	V4 b4 = l1(0.0, 0.0);
	V4 b5 = l1(1.0, 0.0);
	V4 b6 = l1(-1.0, 1.0);
	V4 b7 = l1(0.0, 1.0);
	V4 b8 = l1(1.0, 1.0);

	// Non-negative half of each t1 tap.
	V4 q0 = max(b0, 0.0);
	V4 q1 = max(b1, 0.0);
	V4 q2 = max(b2, 0.0);
	V4 q3 = max(b3, 0.0);
	V4 q4 = max(b4, 0.0);
	V4 q5 = max(b5, 0.0);
	V4 q6 = max(b6, 0.0);
	V4 q7 = max(b7, 0.0);
	V4 q8 = max(b8, 0.0);

	// Non-positive half of each t1 tap.
	V4 m0 = -max(-b0, 0.0);
	V4 m1 = -max(-b1, 0.0);
	V4 m2 = -max(-b2, 0.0);
	V4 m3 = -max(-b3, 0.0);
	V4 m4 = -max(-b4, 0.0);
	V4 m5 = -max(-b5, 0.0);
	V4 m6 = -max(-b6, 0.0);
	V4 m7 = -max(-b7, 0.0);
	V4 m8 = -max(-b8, 0.0);

	// Argument order expected by f0/f1: t0 (+), t0 (-), t1 (+), t1 (-),
	// nine taps each.
	t2[gxy] = f0(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8, q0, q1, q2, q3, q4, q5, q6, q7, q8, m0, m1, m2, m3, m4, m5, m6, m7, m8);
	t3[gxy] = f1(p0, p1, p2, p3, p4, p5, p6, p7, p8, n0, n1, n2, n3, n4, n5, n6, n7, n8, q0, q1, q2, q3, q4, q5, q6, q7, q8, m0, m1, m2, m3, m4, m5, m6, m7, m8);
}

//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t2, t3
//!OUT t0, t1

// Tap helpers for this pass: l0 reads t2 and l1 reads t3, each at an offset of
// (x, y) * pt from `pos`, using the point sampler via the O() macro. Both
// macros therefore require `pos` and `pt` to be in scope where they are
// expanded. The sampled value is converted to V4 (min16float4).
#define l0(x, y) V4(O(t2, float2(x, y)))
#define l1(x, y) V4(O(t3, float2(x, y)))

// conv4, first output group (Pass5 stores the result in t0).
//
// Takes 36 four-component inputs and returns one four-component result:
// every input is multiplied, as a row vector, by its own constant 4x4 weight
// matrix, the products are summed in the order written, and a constant bias
// vector is added last.
//
// As called from Pass5, sG_K is tap K (0..8) of input group G:
//   s0_* = max(t2 tap, 0)      s1_* = -max(-(t2 tap), 0)
//   s2_* = max(t3 tap, 0)      s3_* = -max(-(t3 tap), 0)
//
// The accumulation is done in min16float, so the statement order is part of
// the numerical result; do not reorder or regroup the terms.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(1.060e-02, 9.173e-03, 9.548e-04, -7.886e-02, -1.324e-02, 4.660e-02, -4.997e-02, -5.676e-02, -3.290e-02, 6.253e-02, -5.777e-02, 1.265e-02, 6.136e-03, 7.179e-02, 3.102e-02, 4.961e-02));
	r += mul(s0_1, M4(5.787e-03, -2.090e-03, -1.489e-01, 4.380e-02, 1.259e-01, 5.508e-01, 1.211e-01, 3.385e-01, 2.399e-02, -1.436e-01, 2.987e-03, -2.839e-02, -3.021e-02, -8.641e-03, 1.716e-01, -1.328e-02));
	r += mul(s0_2, M4(-3.284e-02, -5.196e-02, -2.983e-02, -2.858e-02, 1.729e-02, 7.665e-02, 1.387e-01, 1.037e-01, 4.289e-02, 1.274e-01, 3.348e-02, 1.911e-02, -1.786e-02, -4.888e-02, 6.323e-02, -2.989e-02));
	r += mul(s0_3, M4(2.473e-02, -6.550e-02, -1.373e-01, 3.680e-02, 1.575e-01, -8.270e-02, 3.186e-02, -3.836e-02, 4.508e-02, 4.254e-02, 5.656e-03, -9.132e-02, 1.334e-01, -5.076e-02, -2.445e-02, -4.735e-02));
	r += mul(s0_4, M4(-5.346e-01, 1.950e-01, 2.121e-01, -3.694e-01, 5.004e-02, 1.610e-02, 2.249e-01, -5.962e-02, -6.243e-02, -3.270e-01, 1.851e-01, 4.051e-02, -2.310e-01, -2.300e-01, -1.314e-01, 3.374e-01));
	r += mul(s0_5, M4(-4.686e-02, -3.968e-01, 2.772e-02, 2.495e-02, 4.541e-02, 8.724e-02, 4.401e-02, -1.515e-02, -6.453e-02, -7.210e-02, -1.250e-02, 4.044e-02, 3.057e-02, 2.485e-01, 2.228e-02, 6.774e-02));
	r += mul(s0_6, M4(-1.518e-01, -6.862e-02, -8.148e-02, -2.030e-01, -4.453e-02, -2.133e-03, -6.081e-02, -8.941e-02, -5.417e-02, 1.564e-02, -5.425e-02, 5.875e-02, -8.805e-02, -1.910e-02, 2.099e-02, -1.402e-02));
	r += mul(s0_7, M4(-1.730e-02, -6.152e-02, -2.764e-01, -8.728e-02, 9.519e-03, -2.799e-02, -5.662e-02, 3.249e-02, 8.716e-02, 2.809e-02, -7.241e-02, 3.046e-02, 1.368e-01, 2.723e-02, 1.130e-01, -4.615e-02));
	r += mul(s0_8, M4(-5.021e-02, -3.352e-02, 5.072e-02, -1.434e-02, 6.511e-02, 6.519e-02, -8.987e-02, 2.193e-02, 1.583e-04, 2.714e-02, -2.315e-02, -3.077e-02, 7.792e-03, 2.782e-02, 9.282e-02, 5.011e-02));
	r += mul(s1_0, M4(-2.541e-02, -9.530e-03, -2.089e-01, -2.421e-02, 1.340e-02, 1.228e-01, 8.861e-02, -1.063e-02, -7.461e-02, 5.226e-02, -7.276e-02, 3.544e-02, -1.591e-02, 1.851e-02, 9.562e-03, 4.559e-02));
	r += mul(s1_1, M4(2.747e-02, -7.982e-02, -1.475e-01, 4.885e-02, -1.175e-02, -9.209e-02, -9.273e-02, -7.428e-02, 3.696e-02, -2.012e-01, 4.627e-02, 3.609e-02, 1.096e-01, -5.087e-02, 2.170e-01, 5.311e-02));
	r += mul(s1_2, M4(2.410e-02, 6.970e-02, 2.315e-02, 2.908e-02, 2.961e-05, 1.661e-02, 8.374e-02, 5.064e-02, 2.637e-02, 1.330e-01, 5.175e-02, -5.518e-02, -4.871e-03, 1.162e-01, 8.451e-02, 1.741e-02));
	r += mul(s1_3, M4(4.863e-02, -7.095e-02, 3.927e-03, -9.085e-02, 2.639e-02, -8.297e-02, -1.865e-01, -9.647e-02, 6.967e-02, 1.376e-02, 1.222e-01, -2.819e-01, 1.563e-01, -1.399e-02, -4.367e-02, -5.187e-02));
	r += mul(s1_4, M4(9.322e-02, 9.848e-02, 1.680e-01, -2.298e-01, -6.183e-02, -4.167e-02, -1.103e-02, -9.856e-03, -2.983e-03, -3.805e-01, -3.115e-01, -4.107e-01, -1.341e-01, -3.703e-01, -3.661e-01, -4.633e-01));
	r += mul(s1_5, M4(-2.785e-03, -2.188e-02, -2.790e-03, -4.276e-04, 7.082e-02, 1.004e-01, -3.532e-03, 1.740e-03, 6.693e-03, -5.230e-01, 2.119e-01, 2.878e-02, 3.915e-03, 1.842e-01, -1.630e-02, -3.874e-02));
	r += mul(s1_6, M4(2.313e-02, -6.545e-02, 1.631e-02, -1.278e-01, -4.216e-02, -4.147e-02, 6.827e-02, -1.725e-02, -5.254e-02, -3.942e-02, -2.400e-02, -8.124e-02, -3.250e-02, -1.806e-03, -3.947e-02, -7.056e-02));
	r += mul(s1_7, M4(8.445e-03, 1.147e-01, -7.772e-02, 1.091e-01, 1.842e-02, -6.040e-03, -7.053e-02, 1.824e-02, 2.212e-01, -8.777e-02, -1.003e-01, 6.533e-03, 2.090e-01, 4.588e-02, 9.886e-02, 6.176e-02));
	r += mul(s1_8, M4(4.046e-02, 1.872e-02, -5.723e-02, -4.997e-02, 5.232e-03, 1.795e-02, -2.747e-02, -1.507e-02, -1.704e-01, 7.849e-02, -1.475e-01, -4.255e-02, 7.807e-02, 4.185e-02, 3.849e-02, 3.137e-02));
	r += mul(s2_0, M4(-8.062e-03, 6.677e-02, 6.217e-02, 1.833e-01, -1.475e-01, 2.782e-01, 3.524e-02, -6.275e-02, 4.315e-02, 1.484e-02, 3.820e-02, -3.304e-02, 1.659e-03, -9.567e-03, -3.360e-02, -2.623e-02));
	r += mul(s2_1, M4(9.928e-02, -2.526e-01, -2.613e-02, 2.043e-01, 1.710e-02, -1.137e-01, 1.798e-01, -1.427e-01, 4.676e-03, 1.728e-01, 8.082e-02, -5.413e-02, -1.710e-02, -3.169e-02, -6.860e-02, 1.496e-02));
	r += mul(s2_2, M4(1.785e-02, 1.092e-01, -7.685e-02, 7.691e-02, 5.271e-03, -5.168e-02, 3.395e-02, 1.726e-02, 2.936e-02, -1.321e-02, 5.364e-02, -6.785e-03, 2.429e-02, -4.442e-02, -6.348e-02, 3.035e-02));
	r += mul(s2_3, M4(2.676e-01, 4.022e-03, -5.435e-02, -2.723e-01, -1.412e-01, -6.091e-01, 1.576e-02, 6.829e-02, -1.410e-01, 5.578e-03, 3.833e-03, 1.863e-01, -2.274e-02, 6.034e-03, 1.518e-01, -5.434e-02));
	r += mul(s2_4, M4(7.884e-02, 5.377e-01, -4.655e-02, -3.752e-01, 1.490e-01, -4.235e-02, -5.390e-02, 2.610e-01, 1.979e-01, -5.718e-02, 1.773e-02, 5.727e-02, 1.703e-02, 7.533e-01, -3.023e-02, 5.456e-02));
	r += mul(s2_5, M4(-4.898e-02, 4.237e-02, 6.311e-02, -4.635e-02, 3.660e-03, 2.139e-01, -3.722e-02, -6.738e-02, -3.009e-02, -6.140e-02, 2.777e-02, 3.917e-02, -1.421e-01, -4.041e-01, -1.524e-01, -9.837e-02));
	r += mul(s2_6, M4(6.071e-02, 1.084e-01, -6.370e-02, 1.323e-01, -7.251e-02, -1.079e-01, 1.208e-01, -4.495e-02, -2.115e-03, -4.107e-02, 2.465e-02, -1.230e-01, -6.064e-02, -4.263e-02, -1.388e-01, 6.519e-02));
	r += mul(s2_7, M4(9.042e-02, -8.032e-02, 1.186e-01, -1.537e-02, -6.566e-03, -3.216e-02, 3.412e-02, -3.207e-02, -1.586e-01, -2.988e-03, -2.358e-03, 2.172e-02, 6.775e-02, -3.590e-01, -4.123e-01, -3.506e-01));
	r += mul(s2_8, M4(8.486e-02, 4.731e-02, 5.779e-02, 1.000e-01, 9.121e-03, -3.421e-02, 4.891e-02, 4.916e-02, 3.343e-03, 4.437e-03, -2.002e-02, -3.856e-02, -1.319e-01, -4.022e-02, -1.752e-01, -9.250e-02));
	r += mul(s3_0, M4(-6.652e-03, 6.416e-02, 9.292e-03, 6.520e-02, 1.213e-02, 4.177e-02, 7.038e-02, -3.160e-02, 2.146e-02, -9.523e-02, -1.436e-01, -8.325e-02, -1.234e-02, -1.222e-02, -3.877e-02, -4.175e-02));
	r += mul(s3_1, M4(-5.171e-02, 1.011e-01, 7.998e-02, -8.804e-02, 1.067e-02, 1.516e-01, 6.508e-02, -7.724e-02, 2.717e-02, -4.901e-02, -6.059e-03, 4.013e-02, -3.833e-02, 1.538e-01, 5.948e-02, -4.945e-02));
	r += mul(s3_2, M4(1.023e-02, -1.230e-01, -1.861e-02, -2.570e-02, 2.512e-02, -4.630e-02, 6.354e-02, 3.897e-02, -2.146e-02, 2.446e-01, 1.906e-03, -9.068e-03, 1.754e-02, -7.082e-02, 1.107e-04, -9.604e-03));
	r += mul(s3_3, M4(7.380e-02, 2.216e-02, -2.608e-02, -6.491e-02, 4.018e-02, -6.657e-02, 1.116e-01, 9.405e-02, -7.168e-02, -3.646e-01, 8.387e-02, 1.352e-02, -4.589e-02, 2.235e-02, 1.881e-01, 1.759e-01));
	r += mul(s3_4, M4(-7.735e-02, -8.574e-02, -6.380e-02, 1.221e-01, 5.556e-02, -1.281e-01, 1.461e-01, 2.757e-01, 8.144e-01, -1.075e-01, 3.165e-03, -2.036e-01, 1.814e-01, 1.744e-01, -1.745e-01, 3.724e-02));
	r += mul(s3_5, M4(-6.864e-02, 1.273e-02, 7.502e-02, 4.164e-02, 1.301e-02, 1.407e-01, -9.985e-02, -8.079e-02, 1.428e-01, 3.034e-01, -1.564e-02, 6.091e-02, -1.271e-02, -2.153e-01, -7.843e-02, -4.063e-02));
	r += mul(s3_6, M4(-5.115e-02, 6.016e-02, -2.719e-02, 4.668e-02, -3.214e-02, -2.274e-02, -6.954e-03, -9.099e-03, 4.861e-02, 1.007e-01, -2.150e-01, -1.607e-01, -3.578e-02, 1.230e-02, -5.095e-02, 1.622e-02));
	r += mul(s3_7, M4(9.498e-02, -6.763e-02, 1.451e-01, 3.408e-03, -3.253e-02, 1.145e-01, 8.122e-03, -9.192e-02, 5.071e-02, -6.317e-02, 1.097e-01, 5.913e-02, 8.494e-02, 2.731e-04, -3.736e-01, -6.110e-03));
	r += mul(s3_8, M4(1.881e-02, 1.750e-02, 5.956e-02, 4.179e-02, -4.554e-02, -9.824e-02, 8.917e-03, 3.348e-02, 4.160e-02, 6.525e-02, 1.484e-02, -2.331e-02, -8.092e-02, -2.834e-02, -1.284e-01, -7.521e-02));
	// Bias term, added after all 36 weighted inputs.
	r += V4(-5.270e-03, 1.390e-02, 8.622e-03, 1.255e-02);
	return r;
}

// conv4, second output group (Pass5 stores the result in t1).
//
// Same shape as the pass-5 f0: 36 four-component inputs, each multiplied as a
// row vector by its own constant 4x4 weight matrix, summed in the order
// written, then offset by a constant bias vector. Only the weights and the
// bias differ.
//
// As called from Pass5, sG_K is tap K (0..8) of input group G:
//   s0_* = max(t2 tap, 0)      s1_* = -max(-(t2 tap), 0)
//   s2_* = max(t3 tap, 0)      s3_* = -max(-(t3 tap), 0)
//
// The accumulation is done in min16float, so the statement order is part of
// the numerical result; do not reorder or regroup the terms.
V4 f1(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(-4.868e-02, -7.333e-02, -1.029e-02, -7.011e-04, 2.404e-02, -9.301e-02, 1.457e-01, 2.242e-02, 6.850e-02, -1.328e-03, -2.557e-02, -4.854e-04, 1.071e-01, 3.788e-04, 1.408e-01, 5.354e-03));
	r += mul(s0_1, M4(7.892e-03, 5.832e-02, -1.077e-01, -6.140e-02, -1.003e-02, -4.887e-01, 8.263e-01, 2.416e-01, -3.434e-02, 1.089e-02, -1.984e-02, -3.615e-02, -5.692e-03, 1.615e-02, -5.680e-02, 2.041e-02));
	r += mul(s0_2, M4(2.136e-02, -2.731e-02, -1.742e-02, 2.592e-02, -4.319e-02, 6.426e-03, 2.110e-02, -6.338e-02, -6.921e-03, -1.288e-03, 4.579e-02, -2.155e-03, 3.041e-02, 1.946e-02, 1.238e-02, 6.906e-02));
	r += mul(s0_3, M4(-2.058e-02, 3.187e-02, -1.057e-01, 2.407e-01, -3.813e-02, -2.640e-02, 3.941e-02, 1.362e-01, 2.406e-02, 1.518e-02, -4.224e-02, 3.455e-02, 5.443e-02, -6.617e-02, -8.858e-02, -1.949e-02));
	r += mul(s0_4, M4(3.205e-01, -6.490e-01, -3.962e-01, -1.142e-01, -3.091e-02, 4.755e-01, -2.822e-01, -1.328e-01, -5.487e-01, 5.932e-02, -2.439e-02, -1.689e-01, -3.681e-02, -8.227e-02, 3.967e-02, -8.989e-02));
	r += mul(s0_5, M4(-5.668e-02, 3.658e-02, 1.227e-02, 8.117e-02, 1.161e-01, 9.350e-02, 9.971e-02, -1.220e-01, 7.876e-02, 5.186e-02, -4.261e-02, 1.436e-01, -2.114e-02, 6.113e-02, 2.251e-02, 2.534e-02));
	r += mul(s0_6, M4(-5.365e-02, -2.678e-02, -2.565e-02, 7.923e-02, -2.138e-02, -4.932e-02, -6.107e-03, 1.685e-02, 5.425e-02, -1.012e-02, -9.037e-03, 8.218e-04, -1.210e-02, 5.623e-02, -2.094e-02, -2.325e-02));
	r += mul(s0_7, M4(2.031e-02, -3.187e-02, 8.229e-02, 1.457e-01, 1.044e-01, -4.475e-02, 2.858e-02, -7.345e-02, -3.919e-02, -5.753e-02, 1.684e-02, -1.669e-01, 9.680e-03, 1.254e-01, 2.022e-03, -9.900e-02));
	r += mul(s0_8, M4(-1.164e-02, 5.171e-02, -5.704e-02, -1.643e-01, 2.554e-02, -9.988e-02, 3.699e-02, -3.752e-02, -8.076e-04, -2.527e-02, -2.081e-02, 3.110e-02, 1.484e-03, 4.064e-02, 2.481e-02, 2.225e-01));
	r += mul(s1_0, M4(4.384e-02, -1.401e-01, -4.071e-02, -1.137e-02, -4.979e-03, 6.159e-02, 1.275e-01, 6.544e-02, 1.288e-01, -4.421e-02, -4.471e-02, 2.682e-02, 5.621e-02, -4.062e-02, 1.034e-01, 6.606e-02));
	r += mul(s1_1, M4(2.799e-02, -1.333e-01, 1.521e-01, -5.025e-02, -1.895e-01, -7.913e-02, -2.321e-01, -6.526e-02, -1.330e-02, 1.499e-02, 1.620e-01, 6.936e-02, -6.816e-02, 1.353e-01, 1.107e-01, 5.514e-02));
	r += mul(s1_2, M4(5.826e-03, -5.941e-03, -2.338e-02, -2.826e-02, 9.265e-02, 3.608e-02, 1.114e-01, 1.274e-01, -1.291e-01, 1.284e-02, -7.540e-02, -3.458e-02, -1.006e-02, 2.083e-02, -8.393e-02, 8.186e-02));
	r += mul(s1_3, M4(-3.893e-02, -1.137e-02, 1.243e-01, 1.118e-01, 7.397e-02, -1.316e-01, -1.303e-01, -7.808e-05, 1.468e-02, -4.172e-03, -5.014e-02, -2.610e-02, 8.366e-02, -2.755e-02, 1.646e-04, -2.938e-02));
	r += mul(s1_4, M4(-1.114e-01, -2.017e-01, 2.898e-03, -7.984e-02, -8.403e-02, 2.626e-02, 1.563e-01, -1.397e-02, 2.724e-02, -6.698e-01, 2.358e-01, -6.466e-01, -9.650e-03, -6.742e-01, 1.411e-01, -3.343e-01));
	r += mul(s1_5, M4(7.380e-02, 6.420e-02, 7.990e-02, 6.014e-02, 5.950e-02, 6.212e-02, -7.881e-02, -2.782e-02, 1.087e-01, -3.347e-02, 3.819e-01, 1.988e-01, 5.813e-02, 2.239e-02, 3.012e-01, 1.275e-01));
	r += mul(s1_6, M4(-9.473e-02, -2.417e-02, -2.870e-02, 7.718e-02, -2.223e-02, 2.306e-02, 8.255e-03, -1.818e-02, -2.983e-02, -3.495e-02, 1.540e-02, 8.013e-02, 1.651e-02, -1.298e-02, 2.377e-02, 5.523e-02));
	r += mul(s1_7, M4(1.414e-01, 1.346e-01, 4.336e-03, -7.594e-02, -2.044e-02, -9.596e-03, -1.087e-03, 5.324e-02, -2.041e-02, -6.328e-02, 7.533e-02, -3.971e-01, 5.408e-04, 1.087e-01, 9.749e-03, -2.047e-01));
	r += mul(s1_8, M4(4.656e-02, -4.771e-02, -2.210e-02, -2.060e-02, -6.953e-03, -3.366e-02, -7.290e-03, -3.300e-02, -1.354e-01, -5.015e-02, -2.887e-02, 2.802e-01, 2.605e-02, -1.972e-02, 1.168e-03, 1.422e-01));
	r += mul(s2_0, M4(8.826e-02, -4.751e-02, 2.493e-01, 4.446e-02, 1.752e-01, 5.741e-03, -1.820e-01, 1.371e-02, -6.855e-02, 1.164e-02, -5.215e-02, -7.373e-04, -1.491e-02, 7.033e-03, -5.440e-02, -9.302e-05));
	r += mul(s2_1, M4(-6.871e-02, -9.419e-03, 3.276e-01, 2.826e-02, 5.675e-02, -3.974e-03, 1.104e-01, -2.975e-02, 3.281e-02, 8.429e-03, 1.129e-01, -4.830e-02, -4.374e-02, -6.905e-02, 8.143e-02, 3.180e-03));
	r += mul(s2_2, M4(-7.197e-02, -1.804e-02, -9.024e-02, -1.527e-03, 2.403e-02, 6.062e-02, 3.346e-02, 4.784e-02, -1.462e-02, 4.216e-02, 2.800e-02, 4.034e-04, -4.216e-02, -4.431e-03, -3.496e-02, -3.005e-02));
	r += mul(s2_3, M4(2.710e-02, -6.523e-02, 1.559e-01, -6.059e-02, 1.965e-01, -1.608e-01, -9.293e-03, -2.404e-01, -4.061e-02, 8.819e-02, 2.112e-02, 2.398e-01, -1.463e-01, 5.373e-02, -1.346e-02, 3.025e-02));
	r += mul(s2_4, M4(1.846e-02, 3.857e-01, -4.128e-01, -2.530e-01, -2.312e-01, 3.354e-02, -3.948e-01, -1.465e-01, 1.072e-01, -8.544e-02, -7.428e-02, 4.751e-02, 2.139e-01, 3.097e-01, -3.761e-01, 5.621e-02));
	r += mul(s2_5, M4(-1.203e-02, -9.598e-02, 4.101e-01, 1.578e-01, -5.394e-02, -6.714e-02, -6.320e-02, 8.249e-03, 5.620e-02, -3.219e-02, 9.398e-03, 6.809e-02, -7.400e-02, -1.431e-01, -1.425e-01, -2.358e-02));
	r += mul(s2_6, M4(4.035e-02, 5.655e-02, -3.307e-03, -3.497e-02, 6.522e-02, 1.103e-01, -9.802e-02, -2.655e-01, -5.802e-02, -4.359e-02, 3.459e-03, 1.592e-01, -2.566e-02, -1.156e-01, 2.646e-02, 3.806e-02));
	r += mul(s2_7, M4(-3.211e-02, -1.509e-01, 2.028e-03, -1.702e-01, 4.576e-02, -5.340e-02, 5.503e-02, 1.257e-02, -5.581e-02, 9.818e-02, -2.745e-02, 1.486e-01, 1.063e-01, -3.707e-01, 1.116e-01, -6.709e-02));
	r += mul(s2_8, M4(9.062e-04, -7.371e-03, 8.420e-02, 1.629e-01, -2.707e-02, -5.219e-03, 6.567e-02, 1.766e-01, 4.554e-04, 1.178e-02, -1.124e-02, 3.477e-02, -5.473e-02, -7.643e-02, 9.083e-03, -4.250e-02));
	r += mul(s3_0, M4(8.537e-02, 7.246e-02, 5.043e-02, 3.850e-02, -3.951e-02, 9.224e-03, 1.640e-02, 1.906e-02, -1.333e-01, -8.517e-02, -1.410e-01, 4.781e-02, -1.641e-02, 2.463e-03, -7.445e-02, -4.602e-02));
	r += mul(s3_1, M4(7.934e-02, 7.380e-03, -1.062e-01, -4.154e-03, -2.611e-02, -3.119e-02, 9.679e-02, -1.394e-02, -1.108e-01, 1.158e-02, 1.850e-01, -6.765e-02, 5.765e-02, 3.392e-02, -1.560e-02, -6.052e-02));
	r += mul(s3_2, M4(-3.056e-02, -8.450e-03, 1.524e-02, -1.007e-02, 1.030e-02, 4.032e-02, 9.837e-02, 9.371e-03, 4.740e-02, -4.795e-02, -3.356e-02, 8.602e-03, 2.484e-02, -9.889e-03, 6.734e-02, -2.287e-02));
	r += mul(s3_3, M4(6.303e-02, -7.328e-02, -5.082e-02, -5.070e-02, -8.129e-03, 7.948e-03, 7.351e-02, 5.601e-02, -4.169e-01, -8.219e-03, 3.373e-01, 1.781e-01, -5.139e-02, 1.471e-01, 3.415e-02, 7.690e-02));
	r += mul(s3_4, M4(1.455e-01, 6.664e-02, 5.792e-02, -1.276e-01, 2.360e-01, 5.978e-02, 5.147e-02, 2.707e-01, -7.581e-01, -2.740e-01, -3.000e-01, -2.017e-02, 2.005e-01, 6.934e-02, 2.996e-02, -3.036e-01));
	r += mul(s3_5, M4(-1.708e-01, -6.628e-02, 9.170e-02, 3.461e-02, -1.028e-01, -4.244e-02, -1.955e-01, -1.875e-01, 8.215e-02, 2.976e-02, -4.637e-03, 1.512e-01, -4.626e-02, 3.819e-03, -2.222e-01, -1.078e-01));
	r += mul(s3_6, M4(9.585e-02, 3.354e-02, -5.140e-03, -2.883e-02, -9.057e-02, 3.950e-02, -5.781e-02, -2.509e-02, -7.106e-02, -1.351e-01, -4.933e-02, 8.332e-02, -2.922e-02, -3.890e-02, 3.291e-03, 7.054e-02));
	r += mul(s3_7, M4(-8.208e-02, 1.377e-02, -2.475e-02, -1.353e-01, 9.728e-02, 7.136e-02, -3.984e-02, 1.374e-01, -1.160e-01, 4.362e-02, 6.714e-02, 5.038e-03, 1.866e-01, -2.349e-01, 8.599e-02, -1.510e-01));
	r += mul(s3_8, M4(-3.279e-02, 4.634e-02, 1.698e-02, 1.410e-01, -2.615e-02, 1.178e-02, 5.268e-02, 5.209e-02, 1.624e-02, 2.431e-02, -3.280e-02, 7.160e-02, -6.704e-02, -1.195e-01, -4.136e-02, -2.048e-01));
	// Bias term, added after all 36 weighted inputs.
	r += V4(-7.279e-03, 1.016e-02, -7.400e-03, 4.979e-03);
	return r;
}

// Pass 5 (conv4) entry point.
//
// Each thread handles one texel. It reads the 3x3 neighbourhood of t2 and t3
// around that texel, splits every tap into a non-negative part max(v, 0) and
// a non-positive part -max(-v, 0), and writes f0/f1 of the resulting 36
// vectors to t0/t1 at the same texel.
//
// blockStart: texel coordinate added to the per-thread offset.
// tid:        thread id; only tid.x is used (remapped through Rmp8x8).
void Pass5(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep exactly these names: the O() macro behind
	// l0/l1 refers to them directly.
	const float2 pt = float2(GetInputPt());
	const uint2 texel = blockStart + Rmp8x8(tid.x);
	const uint2 extent = GetInputSize();
	if (any(texel >= extent)) {
		return;
	}
	const float2 pos = (texel + 0.5) * pt;

	// Non-negative ("up") and non-positive ("dn") halves of every tap,
	// for t2 (suffix 2) and t3 (suffix 3).
	V4 up2[9], dn2[9], up3[9], dn3[9];

	[unroll]
	for (uint k = 0u; k < 9u; ++k) {
		// Tap k lies at column (k % 3) - 1 and row (k / 3) - 1 of the window,
		// i.e. taps are enumerated row by row starting at (-1, -1).
		const float dx = float(k % 3u) - 1.0;
		const float dy = float(k / 3u) - 1.0;

		const V4 a = l0(dx, dy);
		const V4 b = l1(dx, dy);

		up2[k] = max(a, 0.0);
		dn2[k] = -max(-a, 0.0);
		up3[k] = max(b, 0.0);
		dn3[k] = -max(-b, 0.0);
	}

	// Argument order expected by f0/f1: t2 non-negative, t2 non-positive,
	// t3 non-negative, t3 non-positive, each as taps 0..8.
	t0[texel] = f0(
		up2[0], up2[1], up2[2], up2[3], up2[4], up2[5], up2[6], up2[7], up2[8],
		dn2[0], dn2[1], dn2[2], dn2[3], dn2[4], dn2[5], dn2[6], dn2[7], dn2[8],
		up3[0], up3[1], up3[2], up3[3], up3[4], up3[5], up3[6], up3[7], up3[8],
		dn3[0], dn3[1], dn3[2], dn3[3], dn3[4], dn3[5], dn3[6], dn3[7], dn3[8]);
	t1[texel] = f1(
		up2[0], up2[1], up2[2], up2[3], up2[4], up2[5], up2[6], up2[7], up2[8],
		dn2[0], dn2[1], dn2[2], dn2[3], dn2[4], dn2[5], dn2[6], dn2[7], dn2[8],
		up3[0], up3[1], up3[2], up3[3], up3[4], up3[5], up3[6], up3[7], up3[8],
		dn3[0], dn3[1], dn3[2], dn3[3], dn3[4], dn3[5], dn3[6], dn3[7], dn3[8]);
}

//!PASS 6
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0, t1
//!OUT OUTPUT

// Pass-6 tap readers: fetch the texel of t0 (l0) / t1 (l1) at an offset of
// (x, y) texels from the current position. They expand through O(), which
// uses the point sampler SP and the caller's local variables `pos` and `pt`,
// so both names must be in scope wherever these macros are used.
#define l0(x, y) V4(O(t0, float2(x, y)))
#define l1(x, y) V4(O(t1, float2(x, y)))

// Output layer of the "out-shuffle" pass.
//
// Takes 36 four-component inputs; every input is multiplied, as a row vector,
// by its own constant 4x4 weight matrix, the products are summed in the order
// written, a constant bias vector is added, and tanh is applied to the sum.
//
// As called from Pass6, sG_K is tap K (0..8) of input group G:
//   s0_* = max(t0 tap, 0)      s1_* = -max(-(t0 tap), 0)
//   s2_* = max(t1 tap, 0)      s3_* = -max(-(t1 tap), 0)
// and the four components of the result are used as per-pixel offsets for
// the four output pixels that Pass6 writes.
//
// The accumulation is done in min16float, so the statement order is part of
// the numerical result; do not reorder or regroup the terms.
V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8, V4 s2_0, V4 s2_1, V4 s2_2, V4 s2_3, V4 s2_4, V4 s2_5, V4 s2_6, V4 s2_7, V4 s2_8, V4 s3_0, V4 s3_1, V4 s3_2, V4 s3_3, V4 s3_4, V4 s3_5, V4 s3_6, V4 s3_7, V4 s3_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(1.838e-01, -1.901e-02, 9.627e-03, -5.113e-02, -2.616e-02, -2.850e-02, -1.739e-02, 9.125e-03, 1.563e-02, -1.253e-02, 1.902e-02, -1.512e-02, 3.495e-03, -1.497e-02, 4.974e-03, -1.115e-02));
	r += mul(s0_1, M4(-4.288e-01, 2.549e-01, -1.017e-01, -1.945e-01, 4.749e-02, 5.258e-02, -8.284e-03, -2.265e-02, -7.010e-02, -1.542e-02, 6.889e-03, 4.028e-02, -1.355e-02, 4.321e-03, 3.330e-02, 8.256e-02));
	r += mul(s0_2, M4(-1.045e-03, -2.140e-01, -3.928e-02, 1.364e-02, 1.787e-03, -9.427e-03, 1.927e-03, -2.145e-03, -4.617e-03, 2.814e-02, 2.045e-02, 1.776e-02, 3.188e-02, 3.187e-02, -1.498e-02, -2.180e-02));
	r += mul(s0_3, M4(1.059e-02, -5.522e-02, 6.447e-02, -4.395e-02, -4.846e-02, -2.209e-02, 1.866e-02, -5.920e-02, -2.419e-02, 1.032e-02, 7.736e-04, 1.092e-03, 1.854e-02, -4.388e-03, 3.893e-02, 9.549e-03));
	r += mul(s0_4, M4(3.358e-02, 7.431e-03, 1.780e-01, 5.353e-01, 2.674e-01, 2.227e-01, 1.450e-01, 2.085e-01, -8.104e-03, -3.561e-02, -1.231e-01, -1.932e-01, -6.538e-02, 3.378e-02, -1.314e-01, -2.862e-02));
	r += mul(s0_5, M4(-7.887e-02, 4.783e-02, -1.380e-01, -3.877e-01, 3.436e-03, 4.712e-02, -1.250e-02, 2.247e-02, 2.920e-02, 7.190e-02, -2.005e-02, 6.169e-02, 5.594e-03, -4.630e-02, 9.661e-02, 3.625e-02));
	r += mul(s0_6, M4(1.963e-02, -1.873e-02, 1.489e-02, -1.253e-02, -9.356e-03, 1.334e-02, -3.747e-02, -1.115e-02, 7.741e-04, -6.463e-03, 3.707e-03, -3.598e-03, 1.783e-02, -5.539e-03, 9.899e-03, -9.354e-03));
	r += mul(s0_7, M4(-2.952e-03, 3.132e-02, -6.679e-02, 2.883e-02, -3.721e-02, -3.573e-02, 1.204e-01, -6.785e-02, -1.208e-02, -1.355e-04, 2.872e-02, 2.196e-02, -1.655e-02, 3.784e-02, -5.921e-03, 2.494e-02));
	r += mul(s0_8, M4(-1.215e-02, -2.947e-02, -2.454e-03, -6.326e-02, 2.248e-03, 2.302e-02, -2.863e-03, 5.834e-02, 2.187e-02, 9.973e-03, 2.158e-02, 4.902e-02, -2.207e-02, -3.485e-02, -5.118e-02, -6.696e-02));
	r += mul(s1_0, M4(8.377e-02, -3.093e-02, 2.280e-02, -2.664e-02, -4.333e-02, -3.292e-02, -8.109e-03, 1.105e-02, 1.507e-02, 9.138e-03, 2.597e-02, -1.926e-02, 4.537e-02, -9.080e-03, -1.629e-02, -1.180e-02));
	r += mul(s1_1, M4(-7.478e-02, 1.238e-01, -5.092e-02, -3.473e-02, 4.269e-02, 4.444e-02, 7.295e-03, 1.274e-04, -1.646e-01, -1.551e-03, 3.424e-02, 4.906e-02, -2.056e-01, -5.847e-02, 5.262e-02, 1.049e-01));
	r += mul(s1_2, M4(-6.323e-02, -1.179e-01, -1.982e-02, -4.065e-02, 3.089e-03, -9.469e-03, 2.850e-03, 3.314e-03, -1.819e-03, -1.065e-01, 1.882e-02, 9.349e-03, 1.624e-02, -2.906e-02, -2.029e-02, -5.020e-02));
	r += mul(s1_3, M4(1.101e-01, -3.490e-02, 1.327e-01, -2.853e-02, -5.027e-03, -5.703e-02, 6.484e-03, -6.473e-02, -4.310e-02, 3.882e-02, -3.100e-02, -7.837e-04, -5.501e-02, -1.261e-02, 7.285e-02, 4.648e-02));
	r += mul(s1_4, M4(-6.214e-02, 1.841e-01, -9.546e-02, 3.700e-01, 2.824e-01, 3.400e-01, 2.309e-01, 2.237e-01, 3.482e-01, -1.294e-01, -4.546e-01, -3.556e-01, -4.730e-01, -1.392e-01, 4.776e-01, 1.210e-01));
	r += mul(s1_5, M4(-5.408e-02, -1.286e-01, -6.571e-02, -1.230e-01, 9.991e-03, 6.421e-02, 4.305e-03, 1.780e-02, 2.254e-02, 3.661e-01, 6.275e-02, 8.004e-02, -1.834e-02, -3.465e-01, 9.274e-02, 3.935e-01));
	r += mul(s1_6, M4(-1.620e-03, -1.423e-02, 2.785e-02, -1.252e-02, -1.218e-02, 2.842e-03, -3.496e-02, -2.927e-02, -2.106e-02, -7.099e-03, 2.545e-02, 2.484e-02, 2.973e-02, 4.563e-04, 2.010e-04, -1.839e-02));
	r += mul(s1_7, M4(1.400e-04, 3.080e-02, -6.992e-03, 8.032e-02, -2.280e-02, -4.436e-02, 7.600e-02, 1.165e-02, -9.494e-02, -2.207e-02, 2.783e-01, 2.095e-01, 2.645e-02, 5.203e-02, -7.492e-02, -1.303e-02));
	r += mul(s1_8, M4(-5.360e-03, -2.277e-02, -2.252e-02, -6.191e-02, 1.263e-02, 1.540e-02, 9.566e-03, 3.637e-02, -1.265e-02, -3.092e-02, -1.298e-02, -1.187e-02, 1.286e-02, 1.181e-02, -5.675e-02, -5.487e-02));
	r += mul(s2_0, M4(6.275e-02, 3.332e-02, 2.458e-02, -1.910e-02, -1.764e-02, 2.292e-02, -3.220e-02, -1.127e-02, 4.114e-02, 4.303e-02, -3.355e-02, -8.882e-03, 1.881e-02, 1.788e-02, -3.354e-03, -1.345e-02));
	r += mul(s2_1, M4(-1.562e-01, -7.407e-02, -5.684e-02, -3.194e-03, 1.150e-01, -2.700e-02, -9.666e-03, -3.629e-02, 5.862e-02, 6.747e-02, -1.085e-02, -3.454e-02, 7.263e-05, 2.167e-02, 5.491e-03, -6.472e-02));
	r += mul(s2_2, M4(5.068e-03, 4.899e-02, 1.480e-02, -2.153e-02, 1.102e-02, 2.831e-02, -5.931e-03, 1.021e-02, -1.267e-02, -1.569e-02, 5.418e-04, 1.030e-02, -3.280e-02, -3.072e-02, -2.688e-02, -2.208e-02));
	r += mul(s2_3, M4(-7.105e-02, 1.664e-03, -3.108e-02, 6.985e-02, 3.176e-02, 2.312e-02, -3.835e-02, 3.884e-02, -1.038e-01, 6.660e-02, -1.372e-01, 2.432e-02, -2.888e-04, -2.049e-02, 2.271e-02, 9.383e-03));
	r += mul(s2_4, M4(4.697e-01, -3.721e-01, 1.705e-01, -2.767e-01, -1.791e-02, -7.276e-02, 2.503e-01, 1.040e-01, 1.180e-02, -5.212e-01, 4.014e-01, 1.946e-01, -2.547e-02, -1.567e-02, -5.652e-02, 9.687e-02));
	r += mul(s2_5, M4(3.024e-02, 1.618e-02, 2.619e-02, 8.868e-02, -5.217e-02, -7.642e-02, -3.704e-02, -2.374e-02, -4.639e-02, 5.743e-02, -3.967e-02, -2.450e-02, 2.091e-02, -1.108e-02, 6.949e-03, -1.502e-02));
	r += mul(s2_6, M4(5.298e-03, -6.810e-03, -1.982e-02, 1.960e-04, 3.645e-03, 5.483e-03, 3.357e-03, 3.697e-02, -1.339e-02, 3.253e-02, 3.649e-02, 4.492e-03, 2.076e-02, -9.046e-03, 2.043e-02, -7.803e-03));
	r += mul(s2_7, M4(-2.707e-02, 7.878e-02, 1.816e-01, -1.506e-03, 9.060e-03, -1.418e-02, -6.983e-02, -5.833e-02, 3.309e-02, -2.537e-02, -3.298e-01, -1.735e-01, -2.132e-03, 5.241e-02, 1.155e-02, 4.817e-02));
	r += mul(s2_8, M4(-1.781e-02, 1.652e-02, -7.188e-03, 2.114e-03, -1.105e-02, -1.137e-02, -9.037e-03, -5.600e-02, -1.220e-02, 8.292e-03, -3.404e-03, -4.211e-02, -1.018e-02, -1.004e-02, 1.505e-03, -4.591e-03));
	r += mul(s3_0, M4(7.349e-02, 2.926e-02, 2.398e-02, -1.821e-02, -1.290e-02, 1.201e-02, 5.000e-03, 1.316e-02, -1.567e-02, 2.025e-02, -2.171e-02, -3.941e-04, -7.948e-03, 6.116e-02, -9.445e-03, 1.911e-02));
	r += mul(s3_1, M4(-1.294e-01, -6.121e-02, -4.576e-02, 9.211e-03, 1.371e-02, -1.964e-02, -3.133e-03, 4.701e-03, 9.544e-02, 6.692e-03, 4.665e-04, -2.056e-02, 3.455e-01, -2.495e-01, 1.027e-02, -7.393e-02));
	r += mul(s3_2, M4(1.532e-02, 9.402e-03, 6.812e-04, -3.241e-02, 1.245e-03, 6.504e-03, 3.970e-03, 7.168e-03, 9.435e-03, 1.574e-02, -5.118e-03, 5.232e-03, -2.659e-02, -6.011e-02, -2.446e-02, 8.062e-04));
	r += mul(s3_3, M4(-6.714e-02, 3.454e-03, -1.486e-02, 5.921e-02, 5.177e-02, 3.766e-02, -1.473e-01, 4.371e-02, -7.118e-02, 2.462e-02, -1.810e-02, 3.430e-02, -5.552e-02, 3.047e-02, -5.066e-02, 5.769e-02));
	r += mul(s3_4, M4(3.191e-02, -1.387e-01, -5.992e-02, -1.554e-01, 4.660e-01, 3.655e-01, 2.406e-02, -3.902e-01, 3.973e-02, -1.333e-01, 1.792e-01, 8.854e-02, 2.477e-01, -3.115e-01, 6.035e-01, -4.717e-01));
	r += mul(s3_5, M4(7.241e-02, 1.273e-01, 6.810e-02, 1.118e-01, -5.454e-02, -1.728e-02, -1.007e-01, -2.265e-02, -4.534e-02, -5.171e-02, -2.524e-02, -3.337e-02, -2.366e-03, -1.723e-02, 2.300e-02, -9.889e-02));
	r += mul(s3_6, M4(3.383e-02, -7.898e-03, 1.681e-02, -5.131e-03, 3.687e-02, 1.929e-02, -7.695e-03, 2.145e-03, -2.814e-02, 3.366e-02, -8.788e-02, 3.614e-02, 2.951e-02, -6.964e-03, 2.272e-02, 1.581e-02));
	r += mul(s3_7, M4(7.020e-03, 6.046e-02, 2.975e-02, 5.663e-02, 2.155e-02, -1.786e-02, -2.588e-01, -1.310e-01, -6.372e-02, -1.218e-01, -7.160e-02, -3.058e-01, 2.297e-03, 3.050e-02, -2.346e-02, 4.674e-02));
	r += mul(s3_8, M4(-1.071e-02, -1.089e-02, 9.286e-03, 5.202e-02, -2.291e-02, -2.655e-02, 2.386e-02, -3.231e-02, 4.599e-03, 1.114e-02, -1.630e-02, -1.693e-02, -2.194e-03, 1.842e-02, -2.522e-02, 5.265e-02));
	// Bias term, added after all 36 weighted inputs.
	r += V4(-1.667e-03, -2.914e-03, -1.783e-03, -1.113e-03);
	// tanh keeps every component of the result strictly inside (-1, 1).
	return tanh(r);
}

// Pass 6 (out-shuffle) entry point.
//
// Each thread produces a 2x2 group of OUTPUT pixels. It reads the 3x3
// neighbourhood of t0 and t1 around the matching source texel, splits every
// tap into max(v, 0) and -max(-v, 0), and evaluates f0 once. The four
// components of that result are then added, one per output pixel, to the
// first YUV component of the linearly sampled INPUT colour; the sum is
// clamped to [0, 1], the other two YUV components are passed through
// unchanged, and the pixel is converted back to RGB and written with alpha 1.
//
// blockStart: output-pixel coordinate added to the per-thread offset.
// tid:        thread id; only tid.x is used (remapped through Rmp8x8).
void Pass6(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep exactly these names: the O() macro behind
	// l0/l1 refers to them directly.
	const float2 pt = float2(GetInputPt());
	const uint2 dst = blockStart + (Rmp8x8(tid.x) << 1);
	const uint2 extent = GetOutputSize();
	if (any(dst >= extent)) {
		return;
	}
	// Centre of the source texel that covers this 2x2 output group.
	const float2 pos = ((dst >> 1) + 0.5) * pt;

	// Non-negative ("up") and non-positive ("dn") halves of every tap,
	// for t0 (suffix 0) and t1 (suffix 1).
	V4 up0[9], dn0[9], up1[9], dn1[9];

	[unroll]
	for (uint k = 0u; k < 9u; ++k) {
		// Tap k lies at column (k % 3) - 1 and row (k / 3) - 1 of the window,
		// i.e. taps are enumerated row by row starting at (-1, -1).
		const float dx = float(k % 3u) - 1.0;
		const float dy = float(k / 3u) - 1.0;

		const V4 a = l0(dx, dy);
		const V4 b = l1(dx, dy);

		up0[k] = max(a, 0.0);
		dn0[k] = -max(-a, 0.0);
		up1[k] = max(b, 0.0);
		dn1[k] = -max(-b, 0.0);
	}

	// Argument order expected by f0: t0 non-negative, t0 non-positive,
	// t1 non-negative, t1 non-positive, each as taps 0..8.
	const V4 r = f0(
		up0[0], up0[1], up0[2], up0[3], up0[4], up0[5], up0[6], up0[7], up0[8],
		dn0[0], dn0[1], dn0[2], dn0[3], dn0[4], dn0[5], dn0[6], dn0[7], dn0[8],
		up1[0], up1[1], up1[2], up1[3], up1[4], up1[5], up1[6], up1[7], up1[8],
		dn1[0], dn1[1], dn1[2], dn1[3], dn1[4], dn1[5], dn1[6], dn1[7], dn1[8]);

	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	const float2 opt = float2(GetOutputPt());

	// Sampling positions for the four output pixels, visited in the order
	// (x, y) -> (x+1, y) -> (x+1, y+1) -> (x, y+1). Each position is derived
	// from the previous one so the arithmetic matches a running update of a
	// single coordinate.
	const float2 uv00 = pos - 0.5f * opt;
	const float2 uv10 = float2(uv00.x + opt.x, uv00.y);
	const float2 uv11 = float2(uv10.x, uv10.y + opt.y);
	const float2 uv01 = float2(uv11.x - opt.x, uv11.y);

	const float3 yuv00 = mul(rgb2yuv, INPUT.SampleLevel(SL, uv00, 0).rgb);
	const float3 yuv10 = mul(rgb2yuv, INPUT.SampleLevel(SL, uv10, 0).rgb);
	const float3 yuv11 = mul(rgb2yuv, INPUT.SampleLevel(SL, uv11, 0).rgb);
	const float3 yuv01 = mul(rgb2yuv, INPUT.SampleLevel(SL, uv01, 0).rgb);

	// Component-to-pixel mapping: x -> (x, y), y -> (x+1, y),
	// w -> (x+1, y+1), z -> (x, y+1).
	OUTPUT[dst] = float4(mul(yuv2rgb, float3(saturate(yuv00.r + r.x), yuv00.yz)), 1);
	OUTPUT[dst + uint2(1, 0)] = float4(mul(yuv2rgb, float3(saturate(yuv10.r + r.y), yuv10.yz)), 1);
	OUTPUT[dst + uint2(1, 1)] = float4(mul(yuv2rgb, float3(saturate(yuv11.r + r.w), yuv11.yz)), 1);
	OUTPUT[dst + uint2(0, 1)] = float4(mul(yuv2rgb, float3(saturate(yuv01.r + r.z), yuv01.yz)), 1);
}
